diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3d651f43b4d..0a4beb50708 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -207,6 +207,35 @@ jobs: docker kill $(docker ps -q) ||: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH + StressTestDebug: + needs: [BuilderDebDebug] + runs-on: [self-hosted, stress-tester] + steps: + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{runner.temp}}/reports_dir + - name: Check out repository code + uses: actions/checkout@v2 + - name: Stress test + env: + TEMP_PATH: ${{runner.temp}}/stress_debug + REPORTS_PATH: ${{runner.temp}}/reports_dir + CHECK_NAME: 'Stress tests (debug, actions)' + REPO_COPY: ${{runner.temp}}/stress_debug/ClickHouse + REQUIRED_BUILD_NUMBER: 7 + run: | + sudo rm -fr $TEMP_PATH + mkdir -p $TEMP_PATH + cp -r $GITHUB_WORKSPACE $TEMP_PATH + cd $REPO_COPY/tests/ci + python3 stress_check.py "$CHECK_NAME" $REQUIRED_BUILD_NUMBER + - name: Cleanup + if: always() + run: | + docker kill $(docker ps -q) ||: + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr $TEMP_PATH FastTest: needs: DockerHubPush if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} @@ -231,7 +260,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr $TEMP_PATH FinishCheck: - needs: [StyleCheck, DockerHubPush, CheckLabels, BuilderReport, FastTest, FunctionalStatelessTestDebug, FunctionalStatefulTestDebug, DocsCheck] + needs: [StyleCheck, DockerHubPush, CheckLabels, BuilderReport, FastTest, FunctionalStatelessTestDebug, FunctionalStatefulTestDebug, DocsCheck, StressTestDebug] runs-on: [self-hosted, style-checker] steps: - name: Check out repository code diff --git a/.gitmodules b/.gitmodules index e0404c1269d..8ad81b5094f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -76,7 +76,7 @@ url = https://github.com/ClickHouse-Extras/libcxxabi.git [submodule "contrib/snappy"] path = contrib/snappy - url = https://github.com/google/snappy + url = https://github.com/ClickHouse-Extras/snappy.git [submodule "contrib/cppkafka"] path = contrib/cppkafka url = https://github.com/mfontanini/cppkafka.git diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index dd6ead7d97f..f13110d7179 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. 
-SET(VERSION_REVISION 54456) +SET(VERSION_REVISION 54457) SET(VERSION_MAJOR 21) -SET(VERSION_MINOR 11) +SET(VERSION_MINOR 12) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7) -SET(VERSION_DESCRIBE v21.11.1.1-prestable) -SET(VERSION_STRING 21.11.1.1) +SET(VERSION_GITHASH 503a418dedf0011e9040c3a1b6913e0b5488be4c) +SET(VERSION_DESCRIBE v21.12.1.1-prestable) +SET(VERSION_STRING 21.12.1.1) # end of autochange diff --git a/contrib/snappy b/contrib/snappy index 3f194acb57e..fb057edfed8 160000 --- a/contrib/snappy +++ b/contrib/snappy @@ -1 +1 @@ -Subproject commit 3f194acb57e0487531c96b97af61dcbd025a78a3 +Subproject commit fb057edfed820212076239fd32cb2ff23e9016bf diff --git a/debian/changelog b/debian/changelog index 460424bdb36..a2709485e44 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (21.11.1.1) unstable; urgency=low +clickhouse (21.12.1.1) unstable; urgency=low * Modified source code - -- clickhouse-release Thu, 09 Sep 2021 12:03:26 +0300 + -- clickhouse-release Tue, 02 Nov 2021 00:56:42 +0300 diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 926014da9e6..6f9a957852e 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.com/deb/stable/ main/" -ARG version=21.11.1.* +ARG version=21.12.1.* RUN sed -i 's|http://archive|http://ru.archive|g' /etc/apt/sources.list diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 48b59d1e754..04842e7a3de 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:20.04 ARG repository="deb https://repo.clickhouse.com/deb/stable/ main/" -ARG version=21.11.1.* +ARG version=21.12.1.* ARG gosu_ver=1.10 # set non-empty deb_location_url url to create a docker image diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index fc8c2ebbe6e..76967da9f9a 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.com/deb/stable/ main/" -ARG version=21.11.1.* +ARG version=21.12.1.* RUN apt-get update && \ apt-get install -y apt-transport-https dirmngr && \ diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 38595d47528..2e8517d71c9 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -634,7 +634,7 @@ create view query_display_names as select * from create view partial_query_times as select * from file('analyze/partial-query-times.tsv', TSVWithNamesAndTypes, - 'test text, query_index int, time_stddev float, time_median float') + 'test text, query_index int, time_stddev float, time_median double') ; -- Report for partial queries that we could only run on the new server (e.g. diff --git a/docker/test/stress/Dockerfile b/docker/test/stress/Dockerfile index 3fe1b790d5a..e28d25c9485 100644 --- a/docker/test/stress/Dockerfile +++ b/docker/test/stress/Dockerfile @@ -26,4 +26,6 @@ COPY ./stress /stress COPY run.sh / ENV DATASETS="hits visits" +ENV S3_URL="https://clickhouse-datasets.s3.yandex.net" + CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index a4e2f797210..04845f2a4d1 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -112,7 +112,7 @@ configure start # shellcheck disable=SC2086 # No quotes because I want to split it into words. 
-/s3downloader --dataset-names $DATASETS +/s3downloader --url-prefix "$S3_URL" --dataset-names $DATASETS chmod 777 -R /var/lib/clickhouse clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary" clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test" diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index e67c521c4a7..5b7213d0bfe 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -16,10 +16,13 @@ The supported formats are: | [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [TabSeparatedRawWithNames](#tabseparatedrawwithnames) | ✔ | ✔ | +| [TabSeparatedRawWithNamesAndTypes](#tabseparatedrawwithnamesandtypes) | ✔ | ✔ | | [Template](#format-template) | ✔ | ✔ | | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CSVWithNamesAndTypes](#csvwithnamesandtypes) | ✔ | ✔ | | [CustomSeparated](#format-customseparated) | ✔ | ✔ | | [Values](#data-format-values) | ✔ | ✔ | | [Vertical](#vertical) | ✗ | ✔ | @@ -33,8 +36,10 @@ The supported formats are: | [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ | | [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ | | [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ | +| [JSONCompactEachRowWithNames](#jsoncompacteachrowwithnames) | ✔ | ✔ | | [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ | | [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ | +| [JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames) | ✔ | ✔ | | [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ | | [TSKV](#tskv) | ✔ | ✔ | | [Pretty](#pretty) | ✗ | ✔ | @@ -51,6 +56,7 @@ The supported formats are: | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | | [ORC](#data-format-orc) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | +| [RowBinaryWithNames](#rowbinarywithnames) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [Native](#native) | ✔ | ✔ | | [Null](#null) | ✗ | ✔ | @@ -126,6 +132,9 @@ Arrays are written as a list of comma-separated values in square brackets. Numbe [NULL](../sql-reference/syntax.md) is formatted as `\N`. +If setting [input_format_tsv_empty_as_default](../operations/settings/settings.md#settings-input_format_tsv_empty_as_default) is enabled, +empty input fields are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too. + Each element of [Nested](../sql-reference/data-types/nested-data-structures/nested.md) structures is represented as array. For example: @@ -164,17 +173,35 @@ This format is also available under the name `TSVRaw`. ## TabSeparatedWithNames {#tabseparatedwithnames} Differs from the `TabSeparated` format in that the column names are written in the first row. -During parsing, the first row is expected to contain the column names. You can use column names to determine their position and to check their correctness.
+If setting [input_format_with_names_use_header](../operations/settings/settings.md#settings-input_format_with_names_use_header) is set to 1, +the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](../operations/settings/settings.md#settings-input_format_skip_unknown_fields) is set to 1. +Otherwise, the first row will be skipped. This format is also available under the name `TSVWithNames`. ## TabSeparatedWithNamesAndTypes {#tabseparatedwithnamesandtypes} Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row. +The first row with names is processed the same way as in `TabSeparatedWithNames` format. +If setting [input_format_with_types_use_header](../operations/settings/settings.md#settings-input_format_with_types_use_header) is set to 1, +the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped. This format is also available under the name `TSVWithNamesAndTypes`. +## TabSeparatedRawWithNames {#tabseparatedrawwithnames} + +Differs from `TabSeparatedWithNames` format in that the rows are written without escaping. +When parsing with this format, tabs or line feeds are not allowed in any field. + +This format is also available under the name `TSVRawWithNames`. + +## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes} + +Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping. +When parsing with this format, tabs or line feeds are not allowed in any field. + +This format is also available under the name `TSVRawWithNamesAndTypes`. + ## Template {#format-template} This format allows specifying a custom format string with placeholders for values with a specified escaping rule. @@ -195,7 +222,7 @@ where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as - `Raw` (without escaping, similarly to `TSVRaw`) - `None` (no escaping rule, see further) -If an escaping rule is omitted, then `None` will be used. `XML` and `Raw` are suitable only for output. +If an escaping rule is omitted, then `None` will be used. `XML` is suitable only for output. So, for the following format string: @@ -375,9 +402,8 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. -Empty unquoted input values are replaced with default values for the respective columns, if -[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) -is enabled. +If setting [input_format_csv_empty_as_default](../operations/settings/settings.md#settings-input_format_csv_empty_as_default) is enabled, +empty unquoted input values are replaced with default values.
For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too. `NULL` is formatted as `\N` or `NULL` or an empty unquoted string (see settings [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) and [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). @@ -385,7 +411,11 @@ The CSV format supports the output of totals and extremes the same way as `TabSe ## CSVWithNames {#csvwithnames} -Also prints the header row, similar to [TabSeparatedWithNames](#tabseparatedwithnames). +Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + +## CSVWithNamesAndTypes {#csvwithnamesandtypes} + +Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). ## CustomSeparated {#format-customseparated} @@ -657,10 +687,21 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie {"progress":{"read_rows":"3","read_bytes":"24","written_rows":"0","written_bytes":"0","total_rows_to_read":"3"}} ``` +## JSONCompactEachRowWithNames {#jsoncompacteachrowwithnames} + +Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + ## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes} + +Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). + +## JSONCompactStringsEachRowWithNames {#jsoncompactstringseachrowwithnames} + +Differs from `JSONCompactStringsEachRow` in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + ## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes} -Differs from `JSONCompactEachRow`/`JSONCompactStringsEachRow` in that the column names and types are written as the first two rows. +Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). ```json ["'hello'", "multiply(42, number)", "range(5)"] @@ -910,6 +951,13 @@ Array is represented as a varint length (unsigned [LEB128](https://en.wikipedia. For [NULL](../sql-reference/syntax.md#null-literal) support, an additional byte containing 1 or 0 is added before each [Nullable](../sql-reference/data-types/nullable.md) value. If 1, then the value is `NULL` and this byte is interpreted as a separate value. If 0, the value after the byte is not `NULL`.
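To make the `Nullable` encoding just described concrete, here is a small illustrative sketch (an editor's example, not part of the patch; the byte values follow from the rules above, and the query is merely a hypothetical way to observe them):

``` sql
-- A minimal sketch: for a Nullable(UInt8) column, the first row below is encoded
-- as the two bytes 0x00 0x05 (the null-flag byte, then the value), while the
-- second row is encoded as the single byte 0x01 (NULL, so no value byte follows).
SELECT CAST(if(number = 1, NULL, 5), 'Nullable(UInt8)') AS x
FROM numbers(2)
FORMAT RowBinary;
```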
+## RowBinaryWithNames {#rowbinarywithnames} + +Similar to [RowBinary](#rowbinary), but with added header: + +- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) +- N `String`s specifying column names + ## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes} Similar to [RowBinary](#rowbinary), but with added header: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index dcd59438b33..098d73ed080 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -399,7 +399,7 @@ Default value: 1. ## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields} -When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) and [TabSeparated](../../interfaces/formats.md#tabseparated) formats. +When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv), [TabSeparated](../../interfaces/formats.md#tabseparated) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. !!! note "Note" When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. @@ -417,6 +417,12 @@ When enabled, replace empty input fields in TSV with default values. For complex Disabled by default. +## input_format_csv_empty_as_default {#settings-input-format-csv-empty-as-default} + +When enabled, replace empty input fields in CSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Enabled by default. + ## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number} Enables or disables parsing enum values as enum ids for TSV input format. @@ -540,8 +546,40 @@ To improve insert performance, we recommend disabling this check if you are sure Supported formats: -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1.
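As a usage sketch for `input_format_with_names_use_header` (hypothetical table and data, shown as it would be entered in clickhouse-client; not part of the patch), the header row lets input columns arrive in any order:

``` sql
-- A minimal sketch, assuming a table t with columns (id UInt32, msg String).
-- With the setting enabled, the CSV columns are mapped to table columns by the
-- names in the header row, even though their order differs from the table's.
SET input_format_with_names_use_header = 1;

INSERT INTO t FORMAT CSVWithNames
"msg","id"
"hello",1
```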
+ +## input_format_with_types_use_header {#settings-input-format-with-types-use-header} + +Controls whether the format parser should check if the data types from the input data match the data types of the target table. + +Supported formats: + +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes) +- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames) +- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes) +- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames) +- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes) +- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames) +- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes) Possible values: diff --git a/docs/en/sql-reference/statements/create/database.md b/docs/en/sql-reference/statements/create/database.md index 3c6f73d54db..787bbc02346 100644 --- a/docs/en/sql-reference/statements/create/database.md +++ b/docs/en/sql-reference/statements/create/database.md @@ -8,7 +8,7 @@ toc_title: DATABASE Creates a new database. ``` sql -CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] +CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] [COMMENT 'Comment'] ``` ## Clauses {#clauses} @@ -26,4 +26,33 @@ ClickHouse creates the `db_name` database on all the servers of a specified clus ### ENGINE {#engine} -[MySQL](../../../engines/database-engines/mysql.md) allows you to retrieve data from the remote MySQL server. By default, ClickHouse uses its own [database engine](../../../engines/database-engines/index.md). There’s also a [lazy](../../../engines/database-engines/lazy.md) engine. +By default, ClickHouse uses its own [Atomic](../../../engines/database-engines/atomic.md) database engine. There are also [Lazy](../../../engines/database-engines/lazy.md), [MySQL](../../../engines/database-engines/mysql.md), [PostgreSQL](../../../engines/database-engines/postgresql.md), [MaterializedMySQL](../../../engines/database-engines/materialized-mysql.md), [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md), [Replicated](../../../engines/database-engines/replicated.md), [SQLite](../../../engines/database-engines/sqlite.md). + +### COMMENT {#comment} + +You can add a comment to the database when creating it. + +The comment is supported for all database engines. + +**Syntax** + +``` sql +CREATE DATABASE db_name ENGINE = engine(...)
COMMENT 'Comment' +``` + +**Example** + +Query: + +``` sql +CREATE DATABASE db_comment ENGINE = Memory COMMENT 'The temporary database'; +SELECT name, comment FROM system.databases WHERE name = 'db_comment'; +``` + +Result: + +```text +┌─name───────┬─comment────────────────┐ +│ db_comment │ The temporary database │ +└────────────┴────────────────────────┘ +``` diff --git a/docs/ru/sql-reference/statements/create/database.md b/docs/ru/sql-reference/statements/create/database.md index 7d19f3e8f17..b697f4caada 100644 --- a/docs/ru/sql-reference/statements/create/database.md +++ b/docs/ru/sql-reference/statements/create/database.md @@ -8,27 +8,51 @@ toc_title: "База данных" Создает базу данных. ``` sql -CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] +CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] [COMMENT 'Comment'] ``` -### Секции {#sektsii} +## Секции {#clauses} -- `IF NOT EXISTS` +### IF NOT EXISTS {#if-not-exists} - Если база данных с именем `db_name` уже существует, то ClickHouse не создаёт базу данных и: - - Не генерирует исключение, если секция указана. - - Генерирует исключение, если секция не указана. +Если база данных с именем `db_name` уже существует, то ClickHouse не создает базу данных и: -- `ON CLUSTER` +- Не генерирует исключение, если секция указана. +- Генерирует исключение, если секция не указана. - ClickHouse создаёт базу данных `db_name` на всех серверах указанного кластера. +### ON CLUSTER {#on-cluster} -- `ENGINE` +ClickHouse создаёт базу данных с именем `db_name` на всех серверах указанного кластера. Более подробную информацию смотрите в разделе [Распределенные DDL запросы](../../../sql-reference/distributed-ddl.md). - - MySQL +### ENGINE {#engine} - Позволяет получать данные с удаленного сервера MySQL. +По умолчанию ClickHouse использует собственный движок баз данных [Atomic](../../../engines/database-engines/atomic.md). Есть также движки баз данных [Lazy](../../../engines/database-engines/lazy.md), [MySQL](../../../engines/database-engines/mysql.md), [PostgreSQL](../../../engines/database-engines/postgresql.md), [MaterializedMySQL](../../../engines/database-engines/materialized-mysql.md), [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md), [Replicated](../../../engines/database-engines/replicated.md), [SQLite](../../../engines/database-engines/sqlite.md). - По умолчанию ClickHouse использует собственный движок баз данных. +### COMMENT {#comment} - +Вы можете добавить комментарий к базе данных при ее создании. + +Комментарий поддерживается для всех движков баз данных. + +**Синтаксис** + +``` sql +CREATE DATABASE db_name ENGINE = engine(...)
COMMENT 'Comment' +``` + +**Пример** + +Запрос: + +``` sql +CREATE DATABASE db_comment ENGINE = Memory COMMENT 'The temporary database'; +SELECT name, comment FROM system.databases WHERE name = 'db_comment'; +``` + +Результат: + +```text +┌─name───────┬─comment────────────────┐ +│ db_comment │ The temporary database │ +└────────────┴────────────────────────┘ +``` diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 4951106f595..0c4141c74f2 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1007,9 +1007,6 @@ void Client::addOptions(OptionsDescription & options_description) ("max_client_network_bandwidth", po::value(), "the maximum speed of data exchange over the network for the client in bytes per second.") ("compression", po::value(), "enable or disable compression") - ("log-level", po::value(), "client log level") - ("server_logs_file", po::value(), "put server logs into specified file") - ("query-fuzzer-runs", po::value()->default_value(0), "After executing every SELECT query, do random mutations in it and run again specified number of times. This is used for testing to discover unexpected corner cases.") ("interleave-queries-file", po::value>()->multitoken(), "file path with queries to execute before every file from 'queries-file'; multiple files can be specified (--queries-file file1 file2...); this is needed to enable more aggressive fuzzing of newly added tests (see 'query-fuzzer-runs' option)") @@ -1125,8 +1122,6 @@ void Client::processOptions(const OptionsDescription & options_description, max_client_network_bandwidth = options["max_client_network_bandwidth"].as(); if (options.count("compression")) config().setBool("compression", options["compression"].as()); - if (options.count("server_logs_file")) - server_logs_file = options["server_logs_file"].as(); if (options.count("no-warnings")) config().setBool("no-warnings", true); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 9753e7f7c5d..dc7daf2846a 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -182,23 +183,6 @@ void LocalServer::initialize(Poco::Util::Application & self) auto loaded_config = config_processor.loadConfig(); config().add(loaded_config.configuration.duplicate(), PRIO_DEFAULT, false); } - - if (config().has("logger.console") || config().has("logger.level") || config().has("logger.log")) - { - // force enable logging - config().setString("logger", "logger"); - // sensitive data rules are not used here - buildLoggers(config(), logger(), "clickhouse-local"); - } - else - { - // Turn off server logging to stderr - if (!config().has("verbose")) - { - Poco::Logger::root().setLevel("none"); - Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::NullChannel())); - } - } } @@ -497,6 +481,35 @@ void LocalServer::processConfig() } print_stack_trace = config().getBool("stacktrace", false); + auto logging = (config().has("logger.console") + || config().has("logger.level") + || config().has("log-level") + || config().has("logger.log")); + + auto file_logging = config().has("server_logs_file"); + if (is_interactive && logging && !file_logging) + throw Exception("For interactive mode logging is allowed only with --server_logs_file option", + ErrorCodes::BAD_ARGUMENTS); + + if (file_logging) + { + auto level = Poco::Logger::parseLevel(config().getString("log-level", "trace")); + Poco::Logger::root().setLevel(level); + 
Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::SimpleFileChannel(server_logs_file))); + } + else if (logging) + { + // force enable logging + config().setString("logger", "logger"); + // sensitive data rules are not used here + buildLoggers(config(), logger(), "clickhouse-local"); + } + else + { + Poco::Logger::root().setLevel("none"); + Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::NullChannel())); + } + shared_context = Context::createShared(); global_context = Context::createGlobal(shared_context.get()); diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 6f2b178067f..ebc6c7d3107 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1612,9 +1612,13 @@ void ClientBase::init(int argc, char ** argv) ("disable_suggestion,A", "Disable loading suggestion data. Note that suggestion data is loaded asynchronously through a second connection to ClickHouse server. Also it is reasonable to disable suggestion if you want to paste a query with TAB characters. Shorthand option -A is for those who get used to mysql client.") ("time,t", "print query execution time to stderr in non-interactive mode (for benchmarks)") + ("echo", "in batch mode, print query before execution") ("verbose", "print query and other debugging info") + ("log-level", po::value(), "log level") + ("server_logs_file", po::value(), "put server logs into specified file") + ("multiline,m", "multiline") ("multiquery,n", "multiquery") @@ -1701,6 +1705,8 @@ void ClientBase::init(int argc, char ** argv) config().setBool("verbose", true); if (options.count("log-level")) Poco::Logger::root().setLevel(options["log-level"].as()); + if (options.count("server_logs_file")) + server_logs_file = options["server_logs_file"].as(); if (options.count("hardware-utilization")) progress_indication.print_hardware_utilization = true; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 3cd914a89bb..ec2bd025627 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -593,6 +593,7 @@ M(623, CAPN_PROTO_BAD_CAST) \ M(624, BAD_FILE_TYPE) \ M(625, IO_SETUP_ERROR) \ + M(626, CANNOT_SKIP_UNKNOWN_FIELD) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/FileChecker.cpp b/src/Common/FileChecker.cpp index 64c13ceb69c..4de5a92a1b8 100644 --- a/src/Common/FileChecker.cpp +++ b/src/Common/FileChecker.cpp @@ -38,6 +38,11 @@ void FileChecker::setPath(const String & file_info_path_) files_info_path = file_info_path_; } +String FileChecker::getPath() const +{ + return files_info_path; +} + void FileChecker::update(const String & full_file_path) { bool exists = disk->exists(full_file_path); diff --git a/src/Common/FileChecker.h b/src/Common/FileChecker.h index 325e9325267..a0ea449393e 100644 --- a/src/Common/FileChecker.h +++ b/src/Common/FileChecker.h @@ -13,7 +13,9 @@ class FileChecker { public: FileChecker(DiskPtr disk_, const String & file_info_path_); + void setPath(const String & file_info_path_); + String getPath() const; void update(const String & full_file_path); void setEmpty(const String & full_file_path); diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index cd1f7285355..1bebe5f5efc 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -588,6 +588,17 @@ DataTypes Block::getDataTypes() const return res; } +Names Block::getDataTypeNames() const +{ + Names res; + res.reserve(columns()); + + for (const auto & elem : data) + res.push_back(elem.type->getName()); + + return res; +} + bool blocksHaveEqualStructure(const Block & lhs, 
const Block & rhs) { diff --git a/src/Core/Block.h b/src/Core/Block.h index 0f14426f772..cad29dea7e6 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -90,6 +90,7 @@ public: NamesAndTypesList getNamesAndTypesList() const; Names getNames() const; DataTypes getDataTypes() const; + Names getDataTypeNames() const; /// Returns number of rows from first column in block, not equal to nullptr. If no columns, returns 0. size_t rows() const; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e6f685a0650..b204fb19bf6 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -556,10 +556,12 @@ class IColumn; M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \ M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \ - M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \ - M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \ + M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ + M(Bool, input_format_with_names_use_header, true, "For -WithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \ + M(Bool, input_format_with_types_use_header, true, "For -WithNamesAndTypes input formats this controls whether format parser should check if data types from the input match data types from the header.", 0) \ M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \ - M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \ + M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, -WithNames, -WithNamesAndTypes formats).", IMPORTANT) \ + M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \ M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \ M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \ diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index cfc14cadfb0..38b2109eec6 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -52,6 +52,7 @@ public: bool onlyNull() const override { return false; } bool lowCardinality() const override { return true; } bool supportsSparseSerialization() const 
override { return false; } + bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); } static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type); static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index a978fb79bfd..c88690ae02f 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -278,6 +278,9 @@ public: virtual bool lowCardinality() const { return false; } + /// Checks if this type is LowCardinality(Nullable(...)) + virtual bool isLowCardinalityNullable() const { return false; } + /// Strings, Numbers, Date, DateTime, Nullable virtual bool canBeInsideLowCardinality() const { return false; } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 13b57a3ca04..09cd7a5aee9 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -249,6 +250,20 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } +void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + /// Read until \t or \n. + readString(field, istr); + ReadBufferFromString buf(field); + deserializeWholeText(column, buf, settings); +} + +void ISerialization::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + size_t ISerialization::getArrayLevel(const SubstreamPath & path) { size_t level = 0; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 0bc79a9ae94..48283f3e375 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -318,6 +318,14 @@ public: serializeText(column, row_num, ostr, settings); } + /** Text deserialization without escaping and quoting. Reads all data until the first \n or \t * into a temporary string and then calls deserializeWholeText. It was implemented this way * because this function is rarely used and because a proper implementation requires a lot of * additional code in data types serialization and ReadHelpers.
+ */ + virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path); static String getSubcolumnNameForStream(const SubstreamPath & path); diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index 5c63631e2a3..972313a564f 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -163,7 +163,7 @@ void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffe void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); }); + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); }); } diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 3d6f3f386e0..3d0256b1ed9 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -784,6 +784,7 @@ void SerializationLowCardinality::serializeTextJSON(const IColumn & column, size { serializeImpl(column, row_num, &ISerialization::serializeTextJSON, ostr, settings); } + void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings); @@ -794,6 +795,16 @@ void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_ serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings); } +void SerializationLowCardinality::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &ISerialization::deserializeTextRaw, istr, settings); +} + +void SerializationLowCardinality::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &ISerialization::serializeTextRaw, ostr, settings); +} + template void SerializationLowCardinality::serializeImpl( const IColumn & column, size_t row_num, SerializationLowCardinality::SerializeFunctionPtr func, Args &&... 
args) const diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index f0f4fa1a4cc..5f8a2a95a25 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -63,6 +63,8 @@ public: void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; private: template diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index de26f573863..4ff24534692 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include namespace DB @@ -261,8 +260,36 @@ void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer deserializeTextEscapedImpl(column, istr, settings, nested); } +void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextRawImpl(column, istr, settings, nested); +} + +void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeString(settings.tsv.null_representation, ostr); + else + nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings); +} + +template +ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) +{ + return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); +} + template ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) { /// Little tricky, because we cannot discriminate null from first character. @@ -272,7 +299,13 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R /// This is not null, surely. return safeDeserialize(column, *nested, [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); }); + [&nested, &istr, &settings] (IColumn & nested_column) + { + if constexpr (escaped) + nested->deserializeTextEscaped(nested_column, istr, settings); + else + nested->deserializeTextRaw(nested_column, istr, settings); + }); } else { @@ -298,7 +331,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R { /// We could step back to consume backslash again. 
--istr.position(); - nested->deserializeTextEscaped(nested_column, istr, settings); + if constexpr (escaped) + nested->deserializeTextEscaped(nested_column, istr, settings); + else + nested->deserializeTextRaw(nested_column, istr, settings); } else { @@ -306,7 +342,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R ReadBufferFromMemory prefix("\\", 1); ConcatReadBuffer prepended_istr(prefix, istr); - nested->deserializeTextEscaped(nested_column, prepended_istr, settings); + if constexpr (escaped) + nested->deserializeTextEscaped(nested_column, prepended_istr, settings); + else + nested->deserializeTextRaw(nested_column, prepended_istr, settings); /// Synchronise cursor position in original buffer. @@ -512,5 +551,6 @@ template bool SerializationNullable::deserializeTextEscapedImpl(IColumn & template bool SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); template bool SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); template bool SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index 10eb8a56d37..eb3e9bfb430 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -71,6 +71,9 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) /// If ReturnType is void, deserialize Nullable(T) template @@ -83,6 +86,10 @@ public: static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); template static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + template + static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template + static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); private: struct SubcolumnCreator : public ISubcolumnCreator diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index c3c24ed6749..89f7fe4ad9d 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -245,7 +245,7 @@ static inline void read(IColumn & column, Reader && reader) void 
SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d2dc18a03fd..eee5d662107 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -56,7 +56,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; - format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields; + format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.output_format_csv_null_representation; format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null; @@ -107,10 +107,12 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; format_settings.with_names_use_header = settings.input_format_with_names_use_header; + format_settings.with_types_use_header = settings.input_format_with_types_use_header; format_settings.write_statistics = settings.output_format_write_statistics; format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.orc.import_nested = settings.input_format_orc_import_nested; + format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index ee3824081bb..4e10aa4141a 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -68,7 +68,6 @@ public: size_t row)>; private: - using InputCreatorFunc = InputFormatPtr( ReadBuffer & buf, const Block & header, diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 403ccbc6763..cb1eedeaa5b 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -25,10 +25,12 @@ struct FormatSettings bool skip_unknown_fields = false; bool with_names_use_header = false; + bool with_types_use_header = false; bool write_statistics = true; bool import_nested_json = false; bool null_as_default = true; bool decimal_trailing_zeros = false; + bool defaults_for_omitted_fields = true; enum class DateTimeInputFormat { diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 8ef05fa584e..b55e9f59cc7 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -1,5 +1,8 @@ #include #include +#include +#include + #include namespace DB @@ -10,7 +13,8 @@ namespace 
ErrorCodes extern const int LOGICAL_ERROR; } -std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +template +static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) { skipWhitespaceIfAny(in); @@ -19,7 +23,7 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D bool quotes = false; size_t number_of_rows = 0; - while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size)) + while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size || number_of_rows < min_rows)) { const auto current_object_size = memory.size() + static_cast(pos - in.position()); if (current_object_size > 10 * min_chunk_size) @@ -50,19 +54,19 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D } else { - pos = find_first_symbols<'{', '}', '\\', '"'>(pos, in.buffer().end()); + pos = find_first_symbols(pos, in.buffer().end()); if (pos > in.buffer().end()) throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); else if (pos == in.buffer().end()) continue; - else if (*pos == '{') + else if (*pos == opening_bracket) { ++balance; ++pos; } - else if (*pos == '}') + else if (*pos == closing_bracket) { --balance; ++pos; @@ -88,6 +92,16 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D return {loadAtPosition(in, memory, pos), number_of_rows}; } +std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +{ + return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1); +} + +std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) +{ + return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows); +} + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) { /// For JSONEachRow we can safely skip whitespace characters @@ -95,4 +109,37 @@ bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) return buf.eof() || *buf.position() == '['; } +bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings) +{ + try + { + bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); + + if (yield_strings) + { + String str; + readJSONString(str, in); + + ReadBufferFromString buf(str); + + if (as_nullable) + return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + + serialization->deserializeWholeText(column, buf, format_settings); + return true; + } + + if (as_nullable) + return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + + serialization->deserializeTextJSON(column, in, format_settings); + return true; + } + catch (Exception & e) + { + e.addMessage("(while reading the value of key " + column_name + ")"); + throw; + } +} + } diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h index 2d2d4ad5531..4a049aa1abd 100644 --- a/src/Formats/JSONEachRowUtils.h +++ b/src/Formats/JSONEachRowUtils.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -7,8 +8,11 @@ 
namespace DB { -std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); +std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); +std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows); bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); +bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings); + } diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index a95ccb8b064..1aedff5fceb 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -13,6 +13,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory); void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory); void registerFileSegmentationEngineRegexp(FormatFactory & factory); void registerFileSegmentationEngineJSONAsString(FormatFactory & factory); +void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory); /// Formats for both input/output. @@ -87,6 +88,7 @@ void registerFormats() registerFileSegmentationEngineJSONEachRow(factory); registerFileSegmentationEngineRegexp(factory); registerFileSegmentationEngineJSONAsString(factory); + registerFileSegmentationEngineJSONCompactEachRow(factory); registerInputFormatNative(factory); registerOutputFormatNative(factory); diff --git a/src/Formats/registerWithNamesAndTypes.cpp b/src/Formats/registerWithNamesAndTypes.cpp new file mode 100644 index 00000000000..cba578b08c7 --- /dev/null +++ b/src/Formats/registerWithNamesAndTypes.cpp @@ -0,0 +1,13 @@ +#include + +namespace DB +{ + +void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func) +{ + register_func(base_format_name, false, false); + register_func(base_format_name + "WithNames", true, false); + register_func(base_format_name + "WithNamesAndTypes", true, true); +} + +} diff --git a/src/Formats/registerWithNamesAndTypes.h b/src/Formats/registerWithNamesAndTypes.h new file mode 100644 index 00000000000..d8e74e3421e --- /dev/null +++ b/src/Formats/registerWithNamesAndTypes.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include + +namespace DB +{ + +using RegisterWithNamesAndTypesFunc = std::function; +void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func); + +} diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 8f5992efb1b..99d25d87b73 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -16,16 +16,11 @@ struct ColumnMapping using OptionalIndexes = std::vector>; OptionalIndexes column_indexes_for_input_fields; - /// Tracks which columns we have read in a single read() call. - /// For columns that are never read, it is initialized to false when we - /// read the file header, and never changed afterwards. - /// For other columns, it is updated on each read() call. - std::vector read_columns; + /// The list of column indexes that are not presented in input data. + std::vector not_presented_columns; - - /// Whether we have any columns that are not read from file at all, - /// and must be always initialized with defaults. 
-
-    /// Whether we have any columns that are not read from file at all,
-    /// and must be always initialized with defaults.
-    bool have_always_default_columns{false};
+    /// The list of column names in input data. Needed for better exception messages.
+    std::vector<String> names_of_columns;
 };
 
 using ColumnMappingPtr = std::shared_ptr<ColumnMapping>;
 
diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
index c122b9eea1a..0506c539c0f 100644
--- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp
@@ -2,79 +2,96 @@
 #include
 #include
 #include
+#include <DataTypes/DataTypeFactory.h>
+#include <Formats/registerWithNamesAndTypes.h>
 
 
 namespace DB
 {
 
-BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_)
-    : IRowInputFormat(std::move(header), in_, params_), with_names(with_names_), with_types(with_types_)
+namespace ErrorCodes
+{
+    extern const int CANNOT_SKIP_UNKNOWN_FIELD;
+}
+
+BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
+    : RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_)
 {
 }
 
-
-bool BinaryRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
+std::vector<String> BinaryRowInputFormat::readHeaderRow()
 {
-    if (in->eof())
-        return false;
+    std::vector<String> fields;
+    String field;
+    for (size_t i = 0; i < read_columns; ++i)
+    {
+        readStringBinary(field, *in);
+        fields.push_back(field);
+    }
+    return fields;
+}
 
-    size_t num_columns = columns.size();
-    for (size_t i = 0; i < num_columns; ++i)
-        serializations[i]->deserializeBinary(*columns[i], *in);
+std::vector<String> BinaryRowInputFormat::readNames()
+{
+    readVarUInt(read_columns, *in);
+    return readHeaderRow();
+}
 
+std::vector<String> BinaryRowInputFormat::readTypes()
+{
+    auto types = readHeaderRow();
+    for (const auto & type_name : types)
+        read_data_types.push_back(DataTypeFactory::instance().get(type_name));
+    return types;
+}
+
+bool BinaryRowInputFormat::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
+{
+    serialization->deserializeBinary(column, *in);
     return true;
 }
 
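// For reference, the header consumed by readNames()/readTypes() above is a
// varint column count followed by one length-prefixed name per column and,
// for RowBinaryWithNamesAndTypes, one length-prefixed type name per column.
// A sketch of the matching writer side (an illustration mirroring the reader,
// not code from this patch):

void writeRowBinaryHeader(WriteBuffer & out, const Block & header, bool with_names, bool with_types)
{
    size_t columns = header.columns();
    if (with_names || with_types)
        writeVarUInt(columns, out);  /// the count is written once, before the names row
    if (with_names)
        for (size_t i = 0; i < columns; ++i)
            writeStringBinary(header.safeGetByPosition(i).name, out);
    if (with_types)
        for (size_t i = 0; i < columns; ++i)
            writeStringBinary(header.safeGetByPosition(i).type->getName(), out);
}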
-
-void BinaryRowInputFormat::readPrefix()
+void BinaryRowInputFormat::skipHeaderRow()
 {
-    /// NOTE: The header is completely ignored. This can be easily improved.
-
-    UInt64 columns = 0;
     String tmp;
-
-    if (with_names || with_types)
-    {
-        readVarUInt(columns, *in);
-    }
-
-    if (with_names)
-    {
-        for (size_t i = 0; i < columns; ++i)
-        {
-            readStringBinary(tmp, *in);
-        }
-    }
-
-    if (with_types)
-    {
-        for (size_t i = 0; i < columns; ++i)
-        {
-            readStringBinary(tmp, *in);
-        }
-    }
+    for (size_t i = 0; i < read_columns; ++i)
+        readStringBinary(tmp, *in);
 }
 
+void BinaryRowInputFormat::skipNames()
+{
+    readVarUInt(read_columns, *in);
+    skipHeaderRow();
+}
+
+void BinaryRowInputFormat::skipTypes()
+{
+    skipHeaderRow();
+}
+
+void BinaryRowInputFormat::skipField(size_t file_column)
+{
+    if (file_column >= read_data_types.size())
+        throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because it's type is unknown");
+    Field field;
+    read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in);
+}
 
 void registerInputFormatRowBinary(FormatFactory & factory)
 {
-    factory.registerInputFormat("RowBinary", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        const IRowInputFormat::Params & params,
-        const FormatSettings &)
+    auto register_func = [&](const String & format_name, bool with_names, bool with_types)
     {
-        return std::make_shared<BinaryRowInputFormat>(buf, sample, params, false, false);
-    });
+        factory.registerInputFormat(format_name, [with_names, with_types](
+            ReadBuffer & buf,
+            const Block & sample,
+            const IRowInputFormat::Params & params,
+            const FormatSettings & settings)
+        {
+            return std::make_shared<BinaryRowInputFormat>(buf, sample, params, with_names, with_types, settings);
+        });
+    };
 
-    factory.registerInputFormat("RowBinaryWithNamesAndTypes", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        const IRowInputFormat::Params & params,
-        const FormatSettings &)
-    {
-        return std::make_shared<BinaryRowInputFormat>(buf, sample, params, true, true);
-    });
+    registerWithNamesAndTypes("RowBinary", register_func);
 }
 
 }
diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.h b/src/Processors/Formats/Impl/BinaryRowInputFormat.h
index e96a516c1a7..61d6df77522 100644
--- a/src/Processors/Formats/Impl/BinaryRowInputFormat.h
+++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.h
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
 
 
 namespace DB
@@ -12,19 +13,33 @@ class ReadBuffer;
 
 /** A stream for inputting data in a binary line-by-line format.
   */
-class BinaryRowInputFormat : public IRowInputFormat
+class BinaryRowInputFormat : public RowInputFormatWithNamesAndTypes
 {
 public:
-    BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_);
-
-    bool readRow(MutableColumns & columns, RowReadExtension &) override;
-    void readPrefix() override;
+    BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
 
     String getName() const override { return "BinaryRowInputFormat"; }
 
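// A rough sketch, with assumed names, of how the RowInputFormatWithNamesAndTypes
// base class (not shown in this diff) drives the virtual hooks that this class
// and the text formats below override; each row is parsed as
// row start -> fields separated by field delimiters -> row end:
//
// bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadExtension & ext)
// {
//     if (in->eof())
//         return false;
//     skipRowStartDelimiter();
//     for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
//     {
//         if (file_column != 0)
//             skipFieldDelimiter();
//         if (const auto & table_column = column_mapping->column_indexes_for_input_fields[file_column])
//             readField(*columns[*table_column], ...);
//         else
//             skipField(file_column);  /// a header column that the table does not have
//     }
//     skipRowEndDelimiter();
//     return true;
// }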
+    /// RowInputFormatWithNamesAndTypes implements logic with DiagnosticInfo, but
+    /// in this format we cannot provide any DiagnosticInfo, because here we have
+    /// just binary data.
+    std::string getDiagnosticInfo() override { return {}; }
+
 private:
-    bool with_names;
-    bool with_types;
+    bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
+    void skipField(size_t file_column) override;
+
+    void skipNames() override;
+    void skipTypes() override;
+    void skipHeaderRow();
+
+    std::vector<String> readNames() override;
+    std::vector<String> readTypes() override;
+    std::vector<String> readHeaderRow();
+
+    /// Data types read from input data.
+    DataTypes read_data_types;
+    UInt64 read_columns = 0;
 };
 
 }
diff --git a/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp
index dbaee68453e..02c4aee5e4e 100644
--- a/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/BinaryRowOutputFormat.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <Formats/registerWithNamesAndTypes.h>
 
 
 namespace DB
@@ -49,23 +50,19 @@ void BinaryRowOutputFormat::writeField(const IColumn & column, const ISerializat
 
 void registerOutputFormatRowBinary(FormatFactory & factory)
 {
-    factory.registerOutputFormat("RowBinary", [](
-        WriteBuffer & buf,
-        const Block & sample,
-        const RowOutputFormatParams & params,
-        const FormatSettings &)
+    auto register_func = [&](const String & format_name, bool with_names, bool with_types)
     {
-        return std::make_shared<BinaryRowOutputFormat>(buf, sample, false, false, params);
-    });
+        factory.registerOutputFormat(format_name, [with_names, with_types](
+            WriteBuffer & buf,
+            const Block & sample,
+            const RowOutputFormatParams & params,
+            const FormatSettings &)
+        {
+            return std::make_shared<BinaryRowOutputFormat>(buf, sample, with_names, with_types, params);
+        });
+    };
 
-    factory.registerOutputFormat("RowBinaryWithNamesAndTypes", [](
-        WriteBuffer & buf,
-        const Block & sample,
-        const RowOutputFormatParams & params,
-        const FormatSettings &)
-    {
-        return std::make_shared<BinaryRowOutputFormat>(buf, sample, true, true, params);
-    });
+    registerWithNamesAndTypes("RowBinary", register_func);
 }
 
 }
diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
index 4beb260b64a..9de2b908b1e 100644
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@@ -3,12 +3,12 @@
 #include
 #include
 
-#include
+#include
 #include
+#include <Formats/registerWithNamesAndTypes.h>
 #include
 #include
 
-
 namespace DB
 {
 
@@ -19,62 +19,21 @@ namespace ErrorCodes
     extern const int LOGICAL_ERROR;
 }
 
-
-CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
-                                     bool with_names_, const FormatSettings & format_settings_)
-    : RowInputFormatWithDiagnosticInfo(header_, in_, params_)
-    , with_names(with_names_)
-    , format_settings(format_settings_)
+CSVRowInputFormat::CSVRowInputFormat(
+    const Block & header_,
+    ReadBuffer & in_,
+    const Params & params_,
+    bool with_names_,
+    bool with_types_,
+    const FormatSettings & format_settings_)
+    : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_)
 {
-
     const String bad_delimiters = " \t\"'.UL";
     if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos)
-        throw Exception(String("CSV format may not work correctly with delimiter '") + format_settings.csv.delimiter +
-                        "'. Try use CustomSeparated format instead.", ErrorCodes::BAD_ARGUMENTS);
-
-    const auto & sample = getPort().getHeader();
-    size_t num_columns = sample.columns();
-
-    data_types.resize(num_columns);
-    column_indexes_by_names.reserve(num_columns);
-
-    for (size_t i = 0; i < num_columns; ++i)
-    {
-        const auto & column_info = sample.getByPosition(i);
-
-        data_types[i] = column_info.type;
-        column_indexes_by_names.emplace(column_info.name, i);
-    }
-}
-
-
-/// Map an input file column to a table column, based on its name.
-void CSVRowInputFormat::addInputColumn(const String & column_name)
-{
-    const auto column_it = column_indexes_by_names.find(column_name);
-    if (column_it == column_indexes_by_names.end())
-    {
-        if (format_settings.skip_unknown_fields)
-        {
-            column_mapping->column_indexes_for_input_fields.push_back(std::nullopt);
-            return;
-        }
-
-        throw Exception(
-                "Unknown field found in CSV header: '" + column_name + "' " +
-                "at position " + std::to_string(column_mapping->column_indexes_for_input_fields.size()) +
-                "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
-                ErrorCodes::INCORRECT_DATA
-        );
-    }
-
-    const auto column_index = column_it->second;
-
-    if (column_mapping->read_columns[column_index])
-        throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
-
-    column_mapping->read_columns[column_index] = true;
-    column_mapping->column_indexes_for_input_fields.emplace_back(column_index);
+        throw Exception(
+            String("CSV format may not work correctly with delimiter '") + format_settings.csv.delimiter
+                + "'. Try use CustomSeparated format instead.",
+            ErrorCodes::BAD_ARGUMENTS);
 }
 
 static void skipEndOfLine(ReadBuffer & in)
@@ -100,29 +59,6 @@ static void skipEndOfLine(ReadBuffer & in)
         throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA);
 }
 
-
-static void skipDelimiter(ReadBuffer & in, const char delimiter, bool is_last_column)
-{
-    if (is_last_column)
-    {
-        if (in.eof())
-            return;
-
-        /// we support the extra delimiter at the end of the line
-        if (*in.position() == delimiter)
-        {
-            ++in.position();
-            if (in.eof())
-                return;
-        }
-
-        skipEndOfLine(in);
-    }
-    else
-        assertChar(delimiter, in);
-}
-
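// The header-to-column mapping that addInputColumn() used to build here now
// lives in the shared base class; a sketch of the behaviour, assuming it is
// preserved (illustrative, not code from this patch):
//
//     if (auto it = column_indexes_by_names.find(column_name); it != column_indexes_by_names.end())
//         column_mapping->column_indexes_for_input_fields.push_back(it->second);
//     else if (format_settings.skip_unknown_fields)
//         column_mapping->column_indexes_for_input_fields.push_back(std::nullopt);
//     else
//         throw Exception(...);  /// unknown field found in header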
 /// Skip `whitespace` symbols allowed in CSV.
 static inline void skipWhitespacesAndTabs(ReadBuffer & in)
 {
@@ -132,255 +68,138 @@ static inline void skipWhitespacesAndTabs(ReadBuffer & in)
         ++in.position();
 }
 
-
-static void skipRow(ReadBuffer & in, const FormatSettings::CSV & settings, size_t num_columns)
+void CSVRowInputFormat::skipFieldDelimiter()
 {
-    String tmp;
-    for (size_t i = 0; i < num_columns; ++i)
-    {
-        skipWhitespacesAndTabs(in);
-        readCSVString(tmp, in, settings);
-        skipWhitespacesAndTabs(in);
-
-        skipDelimiter(in, settings.delimiter, i + 1 == num_columns);
-    }
+    skipWhitespacesAndTabs(*in);
+    assertChar(format_settings.csv.delimiter, *in);
 }
 
-void CSVRowInputFormat::setupAllColumnsByTableSchema()
+String CSVRowInputFormat::readFieldIntoString()
 {
-    const auto & header = getPort().getHeader();
-    column_mapping->read_columns.assign(header.columns(), true);
-    column_mapping->column_indexes_for_input_fields.resize(header.columns());
-
-    for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i)
-        column_mapping->column_indexes_for_input_fields[i] = i;
+    skipWhitespacesAndTabs(*in);
+    String field;
+    readCSVString(field, *in, format_settings.csv);
+    return field;
}
 
-
-void CSVRowInputFormat::readPrefix()
+void CSVRowInputFormat::skipField()
 {
-    /// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes,
-    /// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it.
-    skipBOMIfExists(*in);
-
-    size_t num_columns = data_types.size();
-    const auto & header = getPort().getHeader();
-
-    /// This is a bit of abstraction leakage, but we have almost the same code in other places.
-    /// Thus, we check if this InputFormat is working with the "real" beginning of the data in case of parallel parsing.
-    if (with_names && getCurrentUnitNumber() == 0)
-    {
-        /// This CSV file has a header row with column names. Depending on the
-        /// settings, use it or skip it.
-        if (format_settings.with_names_use_header)
-        {
-            /// Look at the file header to see which columns we have there.
-            /// The missing columns are filled with defaults.
-            column_mapping->read_columns.assign(header.columns(), false);
-            do
-            {
-                String column_name;
-                skipWhitespacesAndTabs(*in);
-                readCSVString(column_name, *in, format_settings.csv);
-                skipWhitespacesAndTabs(*in);
-
-                addInputColumn(column_name);
-            }
-            while (checkChar(format_settings.csv.delimiter, *in));
-
-            skipDelimiter(*in, format_settings.csv.delimiter, true);
-
-            for (auto read_column : column_mapping->read_columns)
-            {
-                if (!read_column)
-                {
-                    column_mapping->have_always_default_columns = true;
-                    break;
-                }
-            }
-
-            return;
-        }
-        else
-        {
-            skipRow(*in, format_settings.csv, num_columns);
-            setupAllColumnsByTableSchema();
-        }
-    }
-    else if (!column_mapping->is_set)
-        setupAllColumnsByTableSchema();
+    readFieldIntoString();
 }
 
-
-bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
+void CSVRowInputFormat::skipRowEndDelimiter()
 {
+    skipWhitespacesAndTabs(*in);
+
     if (in->eof())
-        return false;
+        return;
 
-    updateDiagnosticInfo();
+    /// we support the extra delimiter at the end of the line
+    if (*in->position() == format_settings.csv.delimiter)
+        ++in->position();
 
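// Examples of row endings accepted by skipRowEndDelimiter() above; the extra
// delimiter before the end of the line (or of the file) is tolerated on purpose:
//
//     1,hello\n      -- ordinary row
//     1,hello,\n     -- trailing delimiter before the newline
//     1,hello,       -- trailing delimiter at end of file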
-    /// Track whether we have to fill any columns in this row with default
-    /// values. If not, we return an empty column mask to the caller, so that
-    /// it doesn't have to check it.
-    bool have_default_columns = column_mapping->have_always_default_columns;
+    skipWhitespacesAndTabs(*in);
+    if (in->eof())
+        return;
 
-    ext.read_columns.assign(column_mapping->read_columns.size(), true);
-    const auto delimiter = format_settings.csv.delimiter;
-    for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
-    {
-        const auto & table_column = column_mapping->column_indexes_for_input_fields[file_column];
-        const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size();
-
-        if (table_column)
-        {
-            skipWhitespacesAndTabs(*in);
-            ext.read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column],
-                serializations[*table_column], is_last_file_column);
-
-            if (!ext.read_columns[*table_column])
-                have_default_columns = true;
-            skipWhitespacesAndTabs(*in);
-        }
-        else
-        {
-            /// We never read this column from the file, just skip it.
-            String tmp;
-            readCSVString(tmp, *in, format_settings.csv);
-        }
-
-        skipDelimiter(*in, delimiter, is_last_file_column);
-    }
-
-    if (have_default_columns)
-    {
-        for (size_t i = 0; i < column_mapping->read_columns.size(); i++)
-        {
-            if (!column_mapping->read_columns[i])
-            {
-                /// The column value for this row is going to be overwritten
-                /// with default by the caller, but the general assumption is
-                /// that the column size increases for each row, so we have
-                /// to insert something. Since we do not care about the exact
-                /// value, we do not have to use the default value specified by
-                /// the data type, and can just use IColumn::insertDefault().
-                columns[i]->insertDefault();
-                ext.read_columns[i] = false;
-            }
-        }
-    }
-
-    return true;
+    skipEndOfLine(*in);
 }
 
-bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
+void CSVRowInputFormat::skipHeaderRow()
+{
+    do
+    {
+        skipField();
+        skipWhitespacesAndTabs(*in);
+    }
+    while (checkChar(format_settings.csv.delimiter, *in));
+
+    skipRowEndDelimiter();
+}
+
+std::vector<String> CSVRowInputFormat::readHeaderRow()
+{
+    std::vector<String> fields;
+    do
+    {
+        fields.push_back(readFieldIntoString());
+        skipWhitespacesAndTabs(*in);
+    }
+    while (checkChar(format_settings.csv.delimiter, *in));
+
+    skipRowEndDelimiter();
+    return fields;
+}
+
+bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
 {
     const char delimiter = format_settings.csv.delimiter;
-    for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column)
+    try
     {
-        if (file_column == 0 && in->eof())
-        {
-            out << "<End of stream>\n";
-            return false;
-        }
-
-        skipWhitespacesAndTabs(*in);
-        if (column_mapping->column_indexes_for_input_fields[file_column].has_value())
+        skipWhitespacesAndTabs(*in);
+        assertChar(delimiter, *in);
+    }
+    catch (const DB::Exception &)
+    {
+        if (*in->position() == '\n' || *in->position() == '\r')
         {
-            const auto & header = getPort().getHeader();
-            size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value();
-            if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx],
-                                                        out, file_column))
-                return false;
+            out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
+ " It's like your file has less columns than expected.\n" + "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; } else { - static const String skipped_column_str = ""; - static const DataTypePtr skipped_column_type = std::make_shared(); - static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) - return false; - } - skipWhitespacesAndTabs(*in); - - /// Delimiters - if (file_column + 1 == column_mapping->column_indexes_for_input_fields.size()) - { - if (in->eof()) - return false; - - /// we support the extra delimiter at the end of the line - if (*in->position() == delimiter) - { - ++in->position(); - if (in->eof()) - break; - } - - if (!in->eof() && *in->position() != '\n' && *in->position() != '\r') - { - out << "ERROR: There is no line feed. "; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n" - " It's like your file has more columns than expected.\n" - "And if your file has the right number of columns, maybe it has an unquoted string value with a comma.\n"; - - return false; - } - - skipEndOfLine(*in); - } - else - { - try - { - assertChar(delimiter, *in); - } - catch (const DB::Exception &) - { - if (*in->position() == '\n' || *in->position() == '\r') - { - out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." - " It's like your file has less columns than expected.\n" - "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; - } - else - { - out << "ERROR: There is no delimiter (" << delimiter << "). "; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n"; - } - return false; - } + out << "ERROR: There is no delimiter (" << delimiter << "). "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; } + return false; } return true; } +bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +{ + skipWhitespacesAndTabs(*in); + + if (in->eof()) + return true; + + /// we support the extra delimiter at the end of the line + if (*in->position() == format_settings.csv.delimiter) + { + ++in->position(); + skipWhitespacesAndTabs(*in); + if (in->eof()) + return true; + } + + if (!in->eof() && *in->position() != '\n' && *in->position() != '\r') + { + out << "ERROR: There is no line feed. 
"; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n" + " It's like your file has more columns than expected.\n" + "And if your file has the right number of columns, maybe it has an unquoted string value with a comma.\n"; + + return false; + } + + skipEndOfLine(*in); + return true; +} void CSVRowInputFormat::syncAfterError() { skipToNextLineOrEOF(*in); } -void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { - const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; - if (index) - { - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column); - } - else - { - String tmp; - readCSVString(tmp, *in, format_settings.csv); - } -} + skipWhitespacesAndTabs(*in); -bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column) -{ const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); @@ -399,7 +218,7 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co column.insertDefault(); return false; } - else if (format_settings.null_as_default && !type->isNullable()) + else if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable()) { /// If value is null but type is not nullable then use default value instead. return SerializationNullable::deserializeTextCSVImpl(column, *in, format_settings, serialization); @@ -412,31 +231,24 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co } } -void CSVRowInputFormat::resetParser() -{ - RowInputFormatWithDiagnosticInfo::resetParser(); - column_mapping->column_indexes_for_input_fields.clear(); - column_mapping->read_columns.clear(); - column_mapping->have_always_default_columns = false; -} - - void registerInputFormatCSV(FormatFactory & factory) { - for (bool with_names : {false, true}) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerInputFormat(with_names ? 
"CSVWithNames" : "CSV", [=]( + factory.registerInputFormat(format_name, [with_names, with_types]( ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(sample, buf, params, with_names, settings); + return std::make_shared(sample, buf, std::move(params), with_names, with_types, settings); }); - } + }; + + registerWithNamesAndTypes("CSV", register_func); } -static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) { char * pos = in.position(); bool quotes = false; @@ -476,7 +288,7 @@ static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB else if (*pos == '\n') { ++number_of_rows; - if (memory.size() + static_cast(pos - in.position()) >= min_chunk_size) + if (memory.size() + static_cast(pos - in.position()) >= min_chunk_size && number_of_rows >= min_rows) need_more_data = false; ++pos; if (loadAtPosition(in, memory, pos) && *pos == '\r') @@ -484,7 +296,7 @@ static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB } else if (*pos == '\r') { - if (memory.size() + static_cast(pos - in.position()) >= min_chunk_size) + if (memory.size() + static_cast(pos - in.position()) >= min_chunk_size && number_of_rows >= min_rows) need_more_data = false; ++pos; if (loadAtPosition(in, memory, pos) && *pos == '\n') @@ -502,8 +314,16 @@ static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB void registerFileSegmentationEngineCSV(FormatFactory & factory) { - factory.registerFileSegmentationEngine("CSV", &fileSegmentationEngineCSVImpl); - factory.registerFileSegmentationEngine("CSVWithNames", &fileSegmentationEngineCSVImpl); + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + size_t min_rows = 1 + int(with_names) + int(with_types); + factory.registerFileSegmentationEngine(format_name, [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) + { + return fileSegmentationEngineCSVImpl(in, memory, min_chunk_size, min_rows); + }); + }; + + registerWithNamesAndTypes("CSV", register_func); } } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index b6075745b39..f239464485a 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include @@ -14,41 +14,44 @@ namespace DB /** A stream for inputting data in csv format. * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. 
  */
-class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo
+class CSVRowInputFormat : public RowInputFormatWithNamesAndTypes
 {
 public:
     /** with_names - in the first line the header with column names
+      * with_types - on the next line header with type names
       */
     CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_,
-                      bool with_names_, const FormatSettings & format_settings_);
+                      bool with_names_, bool with_types_, const FormatSettings & format_settings_);
 
     String getName() const override { return "CSVRowInputFormat"; }
 
-    bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
-    void readPrefix() override;
     bool allowSyncAfterError() const override { return true; }
     void syncAfterError() override;
-    void resetParser() override;
 
 private:
-    /// These fields are computed in constructor.
-    bool with_names;
-    const FormatSettings format_settings;
-    DataTypes data_types;
-    using IndexesMap = std::unordered_map<String, size_t>;
-    IndexesMap column_indexes_by_names;
-
-    void addInputColumn(const String & column_name);
-
-    void setupAllColumnsByTableSchema();
-    bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
-    void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
+    bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
+    bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
     bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
     {
         return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t';
     }
 
-    bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column);
+    bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
+
+    void skipField(size_t /*file_column*/) override { skipField(); }
+    void skipField();
+
+    void skipHeaderRow();
+    void skipNames() override { skipHeaderRow(); }
+    void skipTypes() override { skipHeaderRow(); }
+    void skipFieldDelimiter() override;
+    void skipRowEndDelimiter() override;
+
+    std::vector<String> readHeaderRow();
+    std::vector<String> readNames() override { return readHeaderRow(); }
+    std::vector<String> readTypes() override { return readHeaderRow(); }
+
+    String readFieldIntoString();
 };
 
 }
diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp
index 14d0e519c0c..b300928e569 100644
--- a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include <Formats/registerWithNamesAndTypes.h>
 #include
 
 
@@ -8,8 +9,8 @@ namespace DB
 {
 
-CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_)
-    : IRowOutputFormat(header_, out_, params_), with_names(with_names_), format_settings(format_settings_)
+CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, bool with_types_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_)
    : IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_)
 {
     const auto & sample = getPort(PortKind::Main).getHeader();
     size_t columns = sample.columns();
@@ -18,25 +19,27 @@ CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_
         data_types[i] = sample.safeGetByPosition(i).type;
 }
 
+void CSVRowOutputFormat::writeLine(const std::vector<String> & values)
+{
+    for (size_t i = 0; i < values.size(); ++i)
+    {
+        writeCSVString(values[i], out);
+        if (i + 1 == values.size())
+            writeRowEndDelimiter();
+        else
+            writeFieldDelimiter();
+    }
+}
 
 void CSVRowOutputFormat::doWritePrefix()
 {
     const auto & sample = getPort(PortKind::Main).getHeader();
-    size_t columns = sample.columns();
 
     if (with_names)
-    {
-        for (size_t i = 0; i < columns; ++i)
-        {
-            writeCSVString(sample.safeGetByPosition(i).name, out);
+        writeLine(sample.getNames());
 
-            char delimiter = format_settings.csv.delimiter;
-            if (i + 1 == columns)
-                delimiter = '\n';
-
-            writeChar(delimiter, out);
-        }
-    }
+    if (with_types)
+        writeLine(sample.getDataTypeNames());
 }
 
 
@@ -72,18 +75,20 @@ void CSVRowOutputFormat::writeBeforeExtremes()
 
 void registerOutputFormatCSV(FormatFactory & factory)
 {
-    for (bool with_names : {false, true})
+    auto register_func = [&](const String & format_name, bool with_names, bool with_types)
     {
-        factory.registerOutputFormat(with_names ? "CSVWithNames" : "CSV", [=](
-            WriteBuffer & buf,
-            const Block & sample,
-            const RowOutputFormatParams & params,
-            const FormatSettings & format_settings)
+        factory.registerOutputFormat(format_name, [with_names, with_types](
+            WriteBuffer & buf,
+            const Block & sample,
+            const RowOutputFormatParams & params,
+            const FormatSettings & format_settings)
         {
-            return std::make_shared<CSVRowOutputFormat>(buf, sample, with_names, params, format_settings);
+            return std::make_shared<CSVRowOutputFormat>(buf, sample, with_names, with_types, params, format_settings);
         });
-        factory.markOutputFormatSupportsParallelFormatting(with_names ? "CSVWithNames" : "CSV");
-    }
+        factory.markOutputFormatSupportsParallelFormatting(format_name);
+    };
+
+    registerWithNamesAndTypes("CSV", register_func);
 }
 
 }
diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.h b/src/Processors/Formats/Impl/CSVRowOutputFormat.h
index 780a6c4d3ce..7f5d90203ea 100644
--- a/src/Processors/Formats/Impl/CSVRowOutputFormat.h
+++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.h
@@ -20,7 +20,7 @@ public:
     /** with_names - output in the first line a header with column names
       * with_types - output in the next line header with the names of the types
       */
-    CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_);
+    CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, bool with_types, const RowOutputFormatParams & params_, const FormatSettings & format_settings_);
 
     String getName() const override { return "CSVRowOutputFormat"; }
 
@@ -38,9 +38,11 @@ public:
         return String("text/csv; charset=UTF-8; header=") + (with_names ? "present" : "absent");
"present" : "absent"); } -protected: +private: + void writeLine(const std::vector & values); bool with_names; + bool with_types; const FormatSettings format_settings; DataTypes data_types; }; diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 5b32bf94c4d..a5e0ac6862c 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -185,7 +185,7 @@ void registerInputFormatJSONAsString(FormatFactory & factory) void registerFileSegmentationEngineJSONAsString(FormatFactory & factory) { - factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRowImpl); + factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRow); } void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 962e9d6e5ac..88fb411ffbd 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -1,8 +1,11 @@ -#include -#include - #include + +#include +#include #include +#include +#include +#include #include #include @@ -12,183 +15,39 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; - extern const int CANNOT_READ_ALL_DATA; } -JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(ReadBuffer & in_, - const Block & header_, - Params params_, - const FormatSettings & format_settings_, - bool with_names_, - bool yield_strings_) - : IRowInputFormat(header_, in_, std::move(params_)), format_settings(format_settings_), with_names(with_names_), yield_strings(yield_strings_) +JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( + const Block & header_, + ReadBuffer & in_, + Params params_, + bool with_names_, + bool with_types_, + bool yield_strings_, + const FormatSettings & format_settings_) + : RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_) + , yield_strings(yield_strings_) { - const auto & sample = getPort().getHeader(); - size_t num_columns = sample.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = sample.getByPosition(i); - - data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - } } -void JSONCompactEachRowRowInputFormat::resetParser() +void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter() { - IRowInputFormat::resetParser(); - column_indexes_for_input_fields.clear(); - not_seen_columns.clear(); -} - -void JSONCompactEachRowRowInputFormat::readPrefix() -{ - /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. 
-
-void JSONCompactEachRowRowInputFormat::readPrefix()
-{
-    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
-    skipBOMIfExists(*in);
-
-    if (with_names)
-    {
-        size_t num_columns = getPort().getHeader().columns();
-        read_columns.assign(num_columns, false);
-
-        assertChar('[', *in);
-        do
-        {
-            skipWhitespaceIfAny(*in);
-            String column_name;
-            readJSONString(column_name, *in);
-            addInputColumn(column_name);
-            skipWhitespaceIfAny(*in);
-        }
-        while (checkChar(',', *in));
-        assertChar(']', *in);
-        skipEndOfLine();
-
-        /// Type checking
-        assertChar('[', *in);
-        for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
-        {
-            skipWhitespaceIfAny(*in);
-            String data_type;
-            readJSONString(data_type, *in);
-
-            if (column_indexes_for_input_fields[i] &&
-                data_types[*column_indexes_for_input_fields[i]]->getName() != data_type)
-            {
-                throw Exception(
-                        "Type of '" + getPort().getHeader().getByPosition(*column_indexes_for_input_fields[i]).name
-                        + "' must be " + data_types[*column_indexes_for_input_fields[i]]->getName() +
-                        ", not " + data_type,
-                        ErrorCodes::INCORRECT_DATA
-                );
-            }
-
-            if (i != column_indexes_for_input_fields.size() - 1)
-                assertChar(',', *in);
-            skipWhitespaceIfAny(*in);
-        }
-        assertChar(']', *in);
-    }
-    else
-    {
-        size_t num_columns = getPort().getHeader().columns();
-        read_columns.assign(num_columns, true);
-        column_indexes_for_input_fields.resize(num_columns);
-
-        for (size_t i = 0; i < num_columns; ++i)
-        {
-            column_indexes_for_input_fields[i] = i;
-        }
-    }
-
-    for (size_t i = 0; i < read_columns.size(); ++i)
-    {
-        if (!read_columns[i])
-        {
-            not_seen_columns.emplace_back(i);
-        }
-    }
-}
-
-void JSONCompactEachRowRowInputFormat::addInputColumn(const String & column_name)
-{
-    names_of_columns.emplace_back(column_name);
-
-    const auto column_it = column_indexes_by_names.find(column_name);
-    if (column_it == column_indexes_by_names.end())
-    {
-        if (format_settings.skip_unknown_fields)
-        {
-            column_indexes_for_input_fields.push_back(std::nullopt);
-            return;
-        }
-
-        throw Exception(
-                "Unknown field found in JSONCompactEachRow header: '" + column_name + "' " +
-                "at position " + std::to_string(column_indexes_for_input_fields.size()) +
-                "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
-                ErrorCodes::INCORRECT_DATA
-        );
-    }
-
-    const auto column_index = column_it->second;
-
-    if (read_columns[column_index])
-        throw Exception("Duplicate field found while parsing JSONCompactEachRow header: " + column_name, ErrorCodes::INCORRECT_DATA);
-
-    read_columns[column_index] = true;
-    column_indexes_for_input_fields.emplace_back(column_index);
-}
-
-bool JSONCompactEachRowRowInputFormat::readRow(DB::MutableColumns & columns, DB::RowReadExtension & ext)
-{
-    skipEndOfLine();
-
-    if (in->eof())
-        return false;
-
-    size_t num_columns = columns.size();
-
-    read_columns.assign(num_columns, false);
-
+    skipWhitespaceIfAny(*in);
     assertChar('[', *in);
-    for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
-    {
-        const auto & table_column = column_indexes_for_input_fields[file_column];
-        if (table_column)
-        {
-            readField(*table_column, columns);
-        }
-        else
-        {
-            skipJSONField(*in, StringRef(names_of_columns[file_column]));
-        }
+}
 
-        skipWhitespaceIfAny(*in);
-        if (in->eof())
-            throw ParsingException("Unexpected end of stream while parsing JSONCompactEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA);
-        if (file_column + 1 != column_indexes_for_input_fields.size())
-        {
-            assertChar(',', *in);
-            skipWhitespaceIfAny(*in);
-        }
-    }
+void JSONCompactEachRowRowInputFormat::skipFieldDelimiter()
+{
+    skipWhitespaceIfAny(*in);
+    assertChar(',', *in);
+}
+
+void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter()
+{
+    skipWhitespaceIfAny(*in);
     assertChar(']', *in);
 
-    for (const auto & name : not_seen_columns)
-        columns[name]->insertDefault();
-
-    ext.read_columns = read_columns;
-    return true;
-}
-
-void JSONCompactEachRowRowInputFormat::skipEndOfLine()
-{
     skipWhitespaceIfAny(*in);
     if (!in->eof() && (*in->position() == ',' || *in->position() == ';'))
         ++in->position();
@@ -196,39 +55,55 @@ void JSONCompactEachRowRowInputFormat::skipEndOfLine()
     skipWhitespaceIfAny(*in);
 }
 
-void JSONCompactEachRowRowInputFormat::readField(size_t index, MutableColumns & columns)
+String JSONCompactEachRowRowInputFormat::readFieldIntoString()
 {
-    try
+    skipWhitespaceIfAny(*in);
+    String field;
+    readJSONString(field, *in);
+    return field;
+}
+
+void JSONCompactEachRowRowInputFormat::skipField(size_t file_column)
+{
+    skipWhitespaceIfAny(*in);
+    skipJSONField(*in, column_mapping->names_of_columns[file_column]);
+}
+
+void JSONCompactEachRowRowInputFormat::skipHeaderRow()
+{
+    skipRowStartDelimiter();
+    size_t i = 0;
+    do
     {
-        read_columns[index] = true;
-        const auto & type = data_types[index];
-        const auto & serialization = serializations[index];
-
-        if (yield_strings)
-        {
-            String str;
-            readJSONString(str, *in);
-
-            ReadBufferFromString buf(str);
-
-            if (format_settings.null_as_default && !type->isNullable())
-                read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization);
-            else
-                serialization->deserializeWholeText(*columns[index], buf, format_settings);
-        }
-        else
-        {
-            if (format_settings.null_as_default && !type->isNullable())
-                read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization);
-            else
-                serialization->deserializeTextJSON(*columns[index], *in, format_settings);
-        }
+        if (i >= column_mapping->names_of_columns.size())
+            throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column names");
+        skipField(i++);
+        skipWhitespaceIfAny(*in);
     }
-    catch (Exception & e)
+    while (checkChar(',', *in));
+
+    skipRowEndDelimiter();
+}
+
+std::vector<String> JSONCompactEachRowRowInputFormat::readHeaderRow()
+{
+    skipRowStartDelimiter();
+    std::vector<String> fields;
+    do
     {
-        e.addMessage("(while reading the value of key " + getPort().getHeader().getByPosition(index).name + ")");
-        throw;
+        fields.push_back(readFieldIntoString());
+        skipWhitespaceIfAny(*in);
     }
+    while (checkChar(',', *in));
+
+    skipRowEndDelimiter();
+    return fields;
+}
+
+bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name)
+{
+    skipWhitespaceIfAny(*in);
+    return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings);
 }
 
 void JSONCompactEachRowRowInputFormat::syncAfterError()
@@ -236,43 +111,112 @@ void JSONCompactEachRowRowInputFormat::syncAfterError()
     skipToUnescapedNextLineOrEOF(*in);
 }
 
+bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
+{
+    skipWhitespaceIfAny(*in);
+    if (!checkChar('[', *in))
+    {
+        out << "ERROR: There is no '[' before the row.\n";
+        return false;
+    }
+
+    return true;
+}
+
+bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)
+{
+    try
+    {
+        skipWhitespaceIfAny(*in);
+        assertChar(',', *in);
+    }
+    catch (const DB::Exception &)
+    {
+        if (*in->position() == ']')
+        {
+            out << "ERROR: Closing parenthesis (']') found where comma is expected."
+                   " It's like your file has less columns than expected.\n"
+                   "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n";
+        }
+        else
+        {
+            out << "ERROR: There is no comma. ";
+            verbosePrintString(in->position(), in->position() + 1, out);
+            out << " found instead.\n";
+        }
+        return false;
+    }
+
+    return true;
+}
+
+bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
+{
+    skipWhitespaceIfAny(*in);
+
+    if (in->eof())
+    {
+        out << "ERROR: Unexpected end of file. ']' expected at the end of row.";
+        return false;
+    }
+
+    if (!checkChar(']', *in))
+    {
+        out << "ERROR: There is no closing parenthesis (']') at the end of the row. ";
+        verbosePrintString(in->position(), in->position() + 1, out);
+        out << " found instead.\n";
+        return false;
+    }
+
+    skipWhitespaceIfAny(*in);
+
+    if (in->eof())
+        return true;
+
+    if ((*in->position() == ',' || *in->position() == ';'))
+        ++in->position();
+
+    skipWhitespaceIfAny(*in);
+    return true;
+}
+
 void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
 {
-    factory.registerInputFormat("JSONCompactEachRow", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
+    for (bool yield_strings : {true, false})
     {
-        return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, false, false);
-    });
+        auto register_func = [&](const String & format_name, bool with_names, bool with_types)
+        {
+            factory.registerInputFormat(format_name, [with_names, with_types, yield_strings](
+                ReadBuffer & buf,
+                const Block & sample,
+                IRowInputFormat::Params params,
+                const FormatSettings & settings)
+            {
+                return std::make_shared<JSONCompactEachRowRowInputFormat>(sample, buf, std::move(params), with_names, with_types, yield_strings, settings);
+            });
+        };
 
-    factory.registerInputFormat("JSONCompactEachRowWithNamesAndTypes", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
-    {
-        return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, true, false);
-    });
+        registerWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
+    }
+}
 
-    factory.registerInputFormat("JSONCompactStringsEachRow", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
+void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory)
+{
+    auto register_func = [&](const String & format_name, bool with_names, bool with_types)
     {
-        return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, false, true);
-    });
+        /// In case when we have names and/or types in the first two/one rows,
+        /// we need to read at least one more row of actual data. So, set
+        /// the minimum of rows for segmentation engine according to
+        /// parameters with_names and with_types.
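// A worked instance of the rule above: min_rows is 1 for JSONCompactEachRow,
// 2 for JSONCompactEachRowWithNames (names row + one data row), and 3 for
// JSONCompactEachRowWithNamesAndTypes (names + types + one data row).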
+        size_t min_rows = 1 + int(with_names) + int(with_types);
+        factory.registerFileSegmentationEngine(format_name, [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
+        {
+            return fileSegmentationEngineJSONCompactEachRow(in, memory, min_chunk_size, min_rows);
+        });
+    };
 
-    factory.registerInputFormat("JSONCompactStringsEachRowWithNamesAndTypes", [](
-        ReadBuffer & buf,
-        const Block & sample,
-        IRowInputFormat::Params params,
-        const FormatSettings & settings)
-    {
-        return std::make_shared<JSONCompactEachRowRowInputFormat>(buf, sample, std::move(params), settings, true, true);
-    });
+    registerWithNamesAndTypes("JSONCompactEachRow", register_func);
+    registerWithNamesAndTypes("JSONCompactStringsEachRow", register_func);
 }
 
 }
diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h
index 4077eb6e008..373eb04f06c 100644
--- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h
+++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include
-#include
+#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
 #include
 #include
 
@@ -17,49 +17,47 @@ class ReadBuffer;
   * - JSONCompactStringsEachRowWithNamesAndTypes
   *
   */
-class JSONCompactEachRowRowInputFormat : public IRowInputFormat
+class JSONCompactEachRowRowInputFormat : public RowInputFormatWithNamesAndTypes
 {
 public:
     JSONCompactEachRowRowInputFormat(
-        ReadBuffer & in_,
         const Block & header_,
+        ReadBuffer & in_,
         Params params_,
-        const FormatSettings & format_settings_,
         bool with_names_,
-        bool yield_strings_);
+        bool with_types_,
+        bool yield_strings_,
+        const FormatSettings & format_settings_);
 
     String getName() const override { return "JSONCompactEachRowRowInputFormat"; }
 
-    void readPrefix() override;
-    bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
     bool allowSyncAfterError() const override { return true; }
     void syncAfterError() override;
-    void resetParser() override;
 
 private:
-    void addInputColumn(const String & column_name);
-    void skipEndOfLine();
-    void readField(size_t index, MutableColumns & columns);
+    bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override;
+    bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override;
+    bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override;
+    bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override
+    {
+        return *pos != ',' && *pos != ']' && *pos != ' ' && *pos != '\t';
+    }
 
-    const FormatSettings format_settings;
+    bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override;
 
-    using IndexesMap = std::unordered_map<String, size_t>;
-    IndexesMap column_indexes_by_names;
+    void skipField(size_t file_column) override;
+    void skipHeaderRow();
+    void skipNames() override { skipHeaderRow(); }
+    void skipTypes() override { skipHeaderRow(); }
+    void skipRowStartDelimiter() override;
+    void skipFieldDelimiter() override;
+    void skipRowEndDelimiter() override;
 
-    using OptionalIndexes = std::vector<std::optional<size_t>>;
-    OptionalIndexes column_indexes_for_input_fields;
+    std::vector<String> readHeaderRow();
+    std::vector<String> readNames() override { return readHeaderRow(); }
+    std::vector<String> readTypes() override { return readHeaderRow(); }
+    String readFieldIntoString();
 
-    DataTypes data_types;
-    std::vector<UInt8> read_columns;
-    std::vector<size_t> not_seen_columns;
-
-    /// This is for the correct exceptions in skipping unknown fields.
-    std::vector<String> names_of_columns;
-
-    /// For *WithNamesAndTypes formats.
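// A quick map, for reading the overrides above, of which JSON token each hook
// consumes (assuming the semantics implied by the .cpp changes):
//
//     skipRowStartDelimiter()  -> '['
//     skipFieldDelimiter()     -> ','
//     skipRowEndDelimiter()    -> ']' plus an optional trailing ',' or ';'
//     readNames()/readTypes()  -> a whole bracketed row of JSON strings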
-    bool with_names;
-
-    /// For JSONCompactString* formats.
     bool yield_strings;
 };
 
 }
diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp
index 1ce4277023d..cdff7ff2070 100644
--- a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <Formats/registerWithNamesAndTypes.h>
 
 
 namespace DB
@@ -13,12 +14,10 @@ JSONCompactEachRowRowOutputFormat::JSONCompactEachRowRowOutputFormat(WriteBuffer
     const RowOutputFormatParams & params_,
     const FormatSettings & settings_,
     bool with_names_,
+    bool with_types_,
     bool yield_strings_)
-    : IRowOutputFormat(header_, out_, params_), settings(settings_), with_names(with_names_), yield_strings(yield_strings_)
+    : IRowOutputFormat(header_, out_, params_), settings(settings_), with_names(with_names_), with_types(with_types_), yield_strings(yield_strings_)
 {
-    const auto & sample = getPort(PortKind::Main).getHeader();
-    NamesAndTypesList columns(sample.getNamesAndTypesList());
-    fields.assign(columns.begin(), columns.end());
 }
 
 
@@ -57,39 +56,40 @@ void JSONCompactEachRowRowOutputFormat::writeTotals(const Columns & columns, siz
 {
     writeChar('\n', out);
     size_t num_columns = columns.size();
-    writeChar('[', out);
+    writeRowStartDelimiter();
     for (size_t i = 0; i < num_columns; ++i)
     {
         if (i != 0)
-            JSONCompactEachRowRowOutputFormat::writeFieldDelimiter();
+            writeFieldDelimiter();
 
-        JSONCompactEachRowRowOutputFormat::writeField(*columns[i], *serializations[i], row_num);
+        writeField(*columns[i], *serializations[i], row_num);
     }
-    writeCString("]\n", out);
+    writeRowEndDelimiter();
+}
+
+void JSONCompactEachRowRowOutputFormat::writeLine(const std::vector<String> & values)
+{
+    writeRowStartDelimiter();
+    for (size_t i = 0; i < values.size(); ++i)
+    {
+        writeChar('\"', out);
+        writeString(values[i], out);
+        writeChar('\"', out);
+        if (i != values.size() - 1)
+            writeFieldDelimiter();
+    }
+    writeRowEndDelimiter();
 }
 
 void JSONCompactEachRowRowOutputFormat::doWritePrefix()
 {
+    const auto & header = getPort(PortKind::Main).getHeader();
+
     if (with_names)
-    {
-        writeChar('[', out);
-        for (size_t i = 0; i < fields.size(); ++i)
-        {
-            writeChar('\"', out);
-            writeString(fields[i].name, out);
-            writeChar('\"', out);
-            if (i != fields.size() - 1)
-                writeCString(", ", out);
-        }
-        writeCString("]\n[", out);
-        for (size_t i = 0; i < fields.size(); ++i)
-        {
-            writeJSONString(fields[i].type->getName(), out, settings);
-            if (i != fields.size() - 1)
-                writeCString(", ", out);
-        }
-        writeCString("]\n", out);
-    }
+        writeLine(header.getNames());
+
+    if (with_types)
+        writeLine(header.getDataTypeNames());
 }
 
 void JSONCompactEachRowRowOutputFormat::consumeTotals(DB::Chunk chunk)
@@ -100,45 +100,24 @@ void JSONCompactEachRowRowOutputFormat::consumeTotals(DB::Chunk chunk)
 
 void registerOutputFormatJSONCompactEachRow(FormatFactory & factory)
 {
-    factory.registerOutputFormat("JSONCompactEachRow", [](
-        WriteBuffer & buf,
-        const Block & sample,
-        const RowOutputFormatParams & params,
-        const FormatSettings & format_settings)
+    for (bool yield_strings : {false, true})
     {
-        return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, false, false);
-    });
-    factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRow");
+        auto register_func = [&](const String & format_name, bool with_names, bool with_types)
+        {
+            factory.registerOutputFormat(format_name, [yield_strings, with_names, with_types](
+                WriteBuffer & buf,
+                const Block & sample,
+                const RowOutputFormatParams & params,
+                const FormatSettings & format_settings)
+            {
+                return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, with_names, with_types, yield_strings);
+            });
 
-    factory.registerOutputFormat("JSONCompactEachRowWithNamesAndTypes", [](
-        WriteBuffer & buf,
-        const Block & sample,
-        const RowOutputFormatParams & params,
-        const FormatSettings & format_settings)
-    {
-        return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, true, false);
-    });
-    factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRowWithNamesAndTypes");
+            factory.markOutputFormatSupportsParallelFormatting(format_name);
+        };
 
-    factory.registerOutputFormat("JSONCompactStringsEachRow", [](
-        WriteBuffer & buf,
-        const Block & sample,
-        const RowOutputFormatParams & params,
-        const FormatSettings & format_settings)
-    {
-        return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, false, true);
-    });
-    factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRow");
-
-    factory.registerOutputFormat("JSONCompactStringsEachRowWithNamesAndTypes", [](
-        WriteBuffer & buf,
-        const Block & sample,
-        const RowOutputFormatParams & params,
-        const FormatSettings & format_settings)
-    {
-        return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, true, true);
-    });
-    factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRowWithNamesAndTypes");
+        registerWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
+    }
 }
 
 }
diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h
index 792eb906f4b..aa12ba7e809 100644
--- a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h
+++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h
@@ -21,15 +21,14 @@ public:
         const RowOutputFormatParams & params_,
         const FormatSettings & settings_,
         bool with_names_,
+        bool with_types_,
         bool yield_strings_);
 
     String getName() const override { return "JSONCompactEachRowRowOutputFormat"; }
 
     void doWritePrefix() override;
 
-    void writeBeforeTotals() override {}
     void writeTotals(const Columns & columns, size_t row_num) override;
-    void writeAfterTotals() override {}
 
     void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override;
     void writeFieldDelimiter() override;
@@ -42,11 +41,11 @@ protected:
     void consumeExtremes(Chunk) override {}
 
 private:
+    void writeLine(const std::vector<String> & values);
+
     FormatSettings settings;
-
-    NamesAndTypes fields;
-
     bool with_names;
+    bool with_types;
     bool yield_strings;
 };
 
 }
diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
index d04ba2a49e4..28481313974 100644
--- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp
@@ -136,37 +136,10 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns
     if (seen_columns[index])
         throw Exception("Duplicate field found while parsing JSONEachRow format: " + columnName(index), ErrorCodes::INCORRECT_DATA);
 
-    try
-    {
-        seen_columns[index] = read_columns[index] = true;
-        const auto & type = getPort().getHeader().getByPosition(index).type;
-        const auto & serialization = serializations[index];
-
-        if (yield_strings)
-        {
-            String str;
-            readJSONString(str, *in);
-
-            ReadBufferFromString buf(str);
-
-            if (format_settings.null_as_default && !type->isNullable())
-                read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization);
-            else
-                serialization->deserializeWholeText(*columns[index], buf, format_settings);
-        }
-        else
-        {
-            if (format_settings.null_as_default && !type->isNullable())
-                read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization);
-            else
-                serialization->deserializeTextJSON(*columns[index], *in, format_settings);
-        }
-    }
-    catch (Exception & e)
-    {
-        e.addMessage("(while reading the value of key " + columnName(index) + ")");
-        throw;
-    }
+    seen_columns[index] = true;
+    const auto & type = getPort().getHeader().getByPosition(index).type;
+    const auto & serialization = serializations[index];
+    read_columns[index] = readFieldImpl(*in, *columns[index], type, serialization, columnName(index), format_settings, yield_strings);
 }
 
 inline bool JSONEachRowRowInputFormat::advanceToNextKey(size_t key_index)
@@ -282,8 +255,13 @@ bool JSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi
         if (!seen_columns[i])
             header.getByPosition(i).type->insertDefaultInto(*columns[i]);
 
-    /// return info about defaults set
-    ext.read_columns = read_columns;
+    /// Return info about defaults set.
+    /// If defaults_for_omitted_fields is set to 0, we should just leave already inserted defaults.
+    if (format_settings.defaults_for_omitted_fields)
+        ext.read_columns = read_columns;
+    else
+        ext.read_columns.assign(read_columns.size(), true);
+
     return true;
 }
 
@@ -355,8 +333,8 @@ void registerInputFormatJSONEachRow(FormatFactory & factory)
 
 void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory)
 {
-    factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRowImpl);
-    factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRowImpl);
+    factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRow);
+    factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRow);
 }
 
 void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory)
diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp
index 8cb0fce609e..62c0eaa457e 100644
--- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp
@@ -60,7 +60,7 @@ RegexpRowInputFormat::ColumnFormat RegexpRowInputFormat::stringToFormat(const St
 bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns)
 {
     const auto & type = getPort().getHeader().getByPosition(index).type;
-    bool parse_as_nullable = format_settings.null_as_default && !type->isNullable();
+    bool parse_as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable();
     bool read = true;
     ReadBuffer field_buf(const_cast<char *>(matched_fields[index].data()), matched_fields[index].size(), 0);
     try
@@ -94,9 +94,9 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns)
                 break;
             case ColumnFormat::Raw:
                 if (parse_as_nullable)
-                    read = SerializationNullable::deserializeWholeTextImpl(*columns[index], field_buf, format_settings, serialization);
+                    read = SerializationNullable::deserializeTextRawImpl(*columns[index], field_buf, format_settings, serialization);
                 else
-                    serialization->deserializeWholeText(*columns[index], field_buf, format_settings);
serialization->deserializeTextRaw(*columns[index], field_buf, format_settings); break; default: break; diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 331d6e435d1..606c67aa0d1 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -143,7 +143,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex seen_columns[index] = read_columns[index] = true; const auto & type = getPort().getHeader().getByPosition(index).type; const auto & serialization = serializations[index]; - if (format_settings.null_as_default && !type->isNullable()) + if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable()) read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization); else serialization->deserializeTextEscaped(*columns[index], *in, format_settings); diff --git a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp index 6161303d23a..14dec8420a8 100644 --- a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp @@ -8,7 +8,7 @@ namespace DB { TSKVRowOutputFormat::TSKVRowOutputFormat(WriteBuffer & out_, const Block & header, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : TabSeparatedRowOutputFormat(out_, header, false, false, params_, format_settings_) + : TabSeparatedRowOutputFormat(out_, header, false, false, false, params_, format_settings_) { const auto & sample = getPort(PortKind::Main).getHeader(); NamesAndTypesList columns(sample.getNamesAndTypesList()); diff --git a/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h deleted file mode 100644 index 3e12388bede..00000000000 --- a/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -/** A stream to input data in tsv format, but without escaping individual values. 
- * It only supports columns without '\n' or '\t' - */ -class TabSeparatedRawRowInputFormat : public TabSeparatedRowInputFormat -{ -public: - /** with_names - the first line is the header with the names of the columns - * with_types - on the next line header with type names - */ - TabSeparatedRawRowInputFormat( - const Block & header_, - ReadBuffer & in_, - const Params & params_, - bool with_names_, - bool with_types_, - const FormatSettings & format_settings_) - : TabSeparatedRowInputFormat(header_, in_, params_, with_names_, with_types_, format_settings_) - { - } - - String getName() const override { return "TabSeparatedRawRowInputFormat"; } - - bool readField(IColumn & column, const DataTypePtr &, const SerializationPtr & serialization, bool) override - { - String tmp; - - while (!in->eof()) - { - char * pos = find_first_symbols<'\n', '\t'>(in->position(), in->buffer().end()); - - tmp.append(in->position(), pos - in->position()); - in->position() = pos; - - if (pos == in->buffer().end()) - in->next(); - else - break; - } - - ReadBufferFromString cell(tmp); - serialization->deserializeWholeText(column, cell, format_settings); - - return true; - } -}; - -} diff --git a/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h deleted file mode 100644 index dc9312e53bc..00000000000 --- a/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -/** A stream for outputting data in tsv format, but without escaping individual values. - * (That is, the output is irreversible.) - */ -class TabSeparatedRawRowOutputFormat : public TabSeparatedRowOutputFormat -{ -public: - TabSeparatedRawRowOutputFormat( - WriteBuffer & out_, - const Block & header_, - bool with_names_, - bool with_types_, - const RowOutputFormatParams & params_, - const FormatSettings & format_settings_) - : TabSeparatedRowOutputFormat(out_, header_, with_names_, with_types_, params_, format_settings_) - { - } - - String getName() const override { return "TabSeparatedRawRowOutputFormat"; } - - void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override - { - serialization.serializeText(column, row_num, out, format_settings); - } -}; - -} diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index c1cf0a904ea..1e6d238b202 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -1,13 +1,11 @@ #include -#include -#include #include +#include #include -#include #include #include -#include +#include #include #include @@ -20,19 +18,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - -static void skipTSVRow(ReadBuffer & in, const size_t num_columns) -{ - NullOutput null_sink; - - for (size_t i = 0; i < num_columns; ++i) - { - readEscapedStringInto(null_sink, in); - assertChar(i == num_columns - 1 ? '\n' : '\t', in); - } -} - - /** Check for a common error case - usage of Windows line feed. 
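// Illustrative sketch (not part of this diff): a standalone model of the check
// documented here -- TSV rows must end in a bare '\n'; a '\r' right before the
// row end means the file has DOS/Windows line separators, which the format
// rejects with a hint. Name and message are hypothetical.

#include <stdexcept>

// 'tail' is the byte at the current read position, just before the row-end check.
void checkForCarriageReturnByte(char tail)
{
    if (tail == '\r')
        throw std::runtime_error(
            "Carriage return found where line feed is expected."
            " Your file probably has DOS/Windows style line separators;"
            " convert it to Unix line endings first (e.g. with dos2unix).");
}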
*/ static void checkForCarriageReturn(ReadBuffer & in) @@ -45,188 +30,73 @@ static void checkForCarriageReturn(ReadBuffer & in) ErrorCodes::INCORRECT_DATA); } - -TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) +TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + bool with_names_, + bool with_types_, + bool is_raw_, + const FormatSettings & format_settings_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_) { - const auto & sample = getPort().getHeader(); - size_t num_columns = sample.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = sample.getByPosition(i); - - data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - } - - column_mapping->column_indexes_for_input_fields.reserve(num_columns); - column_mapping->read_columns.assign(num_columns, false); } - -void TabSeparatedRowInputFormat::setupAllColumnsByTableSchema() +void TabSeparatedRowInputFormat::skipFieldDelimiter() { - const auto & header = getPort().getHeader(); - column_mapping->read_columns.assign(header.columns(), true); - column_mapping->column_indexes_for_input_fields.resize(header.columns()); - - for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i) - column_mapping->column_indexes_for_input_fields[i] = i; + assertChar('\t', *in); } - -void TabSeparatedRowInputFormat::addInputColumn(const String & column_name) -{ - const auto column_it = column_indexes_by_names.find(column_name); - if (column_it == column_indexes_by_names.end()) - { - if (format_settings.skip_unknown_fields) - { - column_mapping->column_indexes_for_input_fields.push_back(std::nullopt); - return; - } - - throw Exception( - "Unknown field found in TSV header: '" + column_name + "' " + - "at position " + std::to_string(column_mapping->column_indexes_for_input_fields.size()) + - "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", - ErrorCodes::INCORRECT_DATA - ); - } - - const auto column_index = column_it->second; - - if (column_mapping->read_columns[column_index]) - throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA); - - column_mapping->read_columns[column_index] = true; - column_mapping->column_indexes_for_input_fields.emplace_back(column_index); -} - - -void TabSeparatedRowInputFormat::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension) -{ - /// It is safe to memorize this on the first run - the format guarantees this does not change - if (unlikely(row_num == 1)) - { - columns_to_fill_with_default_values.clear(); - for (size_t index = 0; index < column_mapping->read_columns.size(); ++index) - if (column_mapping->read_columns[index] == 0) - columns_to_fill_with_default_values.push_back(index); - } - - for (const auto column_index : columns_to_fill_with_default_values) - { - data_types[column_index]->insertDefaultInto(*columns[column_index]); - row_read_extension.read_columns[column_index] = false; - } -} 
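// Illustrative sketch (not part of this diff): the per-format logic removed here
// -- remembering which schema columns the input did not provide and filling them
// with type defaults -- moves into the shared RowInputFormatWithNamesAndTypes
// base class introduced later in this change. A minimal standalone model of that
// bookkeeping, with hypothetical names:

#include <cstddef>
#include <vector>

struct Column { void insertDefault() { /* append the column type's default value */ } };

// Fill every column the header row did not mention and report it as "not read";
// all three vectors are assumed to have the same length.
void fillOmittedColumns(std::vector<Column> & columns,
                        const std::vector<bool> & provided_by_input,
                        std::vector<bool> & read_flags)
{
    for (size_t i = 0; i < columns.size(); ++i)
    {
        if (!provided_by_input[i])
        {
            columns[i].insertDefault();
            read_flags[i] = false;  // caller may replace the default later
        }
    }
}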
- - -void TabSeparatedRowInputFormat::readPrefix() -{ - if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8()) - { - /// In this format, we assume that column name or type cannot contain BOM, - /// so, if format has header, - /// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it. - skipBOMIfExists(*in); - } - - /// This is a bit of abstraction leakage, but we have almost the same code in other places. - /// Thus, we check if this InputFormat is working with the "real" beginning of the data in case of parallel parsing. - if (with_names && getCurrentUnitNumber() == 0) - { - if (format_settings.with_names_use_header) - { - String column_name; - for (;;) - { - readEscapedString(column_name, *in); - if (!checkChar('\t', *in)) - { - /// Check last column for \r before adding it, otherwise an error will be: - /// "Unknown field found in TSV header" - checkForCarriageReturn(*in); - addInputColumn(column_name); - break; - } - else - addInputColumn(column_name); - } - - - if (!in->eof()) - { - assertChar('\n', *in); - } - } - else - { - setupAllColumnsByTableSchema(); - skipTSVRow(*in, column_mapping->column_indexes_for_input_fields.size()); - } - } - else if (!column_mapping->is_set) - setupAllColumnsByTableSchema(); - - if (with_types) - { - skipTSVRow(*in, column_mapping->column_indexes_for_input_fields.size()); - } -} - - -bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext) +void TabSeparatedRowInputFormat::skipRowEndDelimiter() { if (in->eof()) - return false; + return; - updateDiagnosticInfo(); + if (unlikely(row_num <= 1)) + checkForCarriageReturn(*in); - ext.read_columns.assign(column_mapping->read_columns.size(), true); - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) - { - const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - if (column_index) - { - const auto & type = data_types[*column_index]; - ext.read_columns[*column_index] = readField(*columns[*column_index], type, serializations[*column_index], is_last_file_column); - } - else - { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); - } - - /// skip separators - if (file_column + 1 < column_mapping->column_indexes_for_input_fields.size()) - { - assertChar('\t', *in); - } - else if (!in->eof()) - { - if (unlikely(row_num == 1)) - checkForCarriageReturn(*in); - - assertChar('\n', *in); - } - } - - fillUnreadColumnsWithDefaults(columns, ext); - - return true; + assertChar('\n', *in); } +String TabSeparatedRowInputFormat::readFieldIntoString() +{ + String field; + readEscapedString(field, *in); + return field; +} + +void TabSeparatedRowInputFormat::skipField() +{ + NullOutput null_sink; + readEscapedStringInto(null_sink, *in); +} + +void TabSeparatedRowInputFormat::skipHeaderRow() +{ + do + { + skipField(); + } + while (checkChar('\t', *in)); + + skipRowEndDelimiter(); +} + +std::vector TabSeparatedRowInputFormat::readHeaderRow() +{ + std::vector fields; + do + { + fields.push_back(readFieldIntoString()); + } + while (checkChar('\t', *in)); + + skipRowEndDelimiter(); + return fields; +} bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, - const SerializationPtr & serialization, bool is_last_file_column) + const SerializationPtr & serialization, bool 
is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t'; const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n'); @@ -236,137 +106,110 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & column.insertDefault(); return false; } - else if (format_settings.null_as_default && !type->isNullable()) + + bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); + + if (is_raw) + { + if (as_nullable) + return SerializationNullable::deserializeTextRawImpl(column, *in, format_settings, serialization); + + serialization->deserializeTextRaw(column, *in, format_settings); + return true; + } + + if (as_nullable) return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization); serialization->deserializeTextEscaped(column, *in, format_settings); return true; } -bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) +bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + try { - if (file_column == 0 && in->eof()) + assertChar('\t', *in); + } + catch (const DB::Exception &) + { + if (*in->position() == '\n') { - out << "\n"; - return false; + out << "ERROR: Line feed found where tab is expected." + " It's like your file has less columns than expected.\n" + "And if your file has the right number of columns, " + "maybe it has an unescaped backslash in value before tab, which causes the tab to be escaped.\n"; } - - if (column_mapping->column_indexes_for_input_fields[file_column].has_value()) + else if (*in->position() == '\r') { - const auto & header = getPort().getHeader(); - size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value(); - if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], - out, file_column)) - return false; + out << "ERROR: Carriage return found where tab is expected.\n"; } else { - static const String skipped_column_str = ""; - static const DataTypePtr skipped_column_type = std::make_shared(); - static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) - return false; - } - - /// Delimiters - if (file_column + 1 == column_mapping->column_indexes_for_input_fields.size()) - { - if (!in->eof()) - { - try - { - assertChar('\n', *in); - } - catch (const DB::Exception &) - { - if (*in->position() == '\t') - { - out << "ERROR: Tab found where line feed is expected." - " It's like your file has more columns than expected.\n" - "And if your file has the right number of columns, maybe it has an unescaped tab in a value.\n"; - } - else if (*in->position() == '\r') - { - out << "ERROR: Carriage return found where line feed is expected." - " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; - } - else - { - out << "ERROR: There is no line feed. 
"; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n"; - } - return false; - } - } - } - else - { - try - { - assertChar('\t', *in); - } - catch (const DB::Exception &) - { - if (*in->position() == '\n') - { - out << "ERROR: Line feed found where tab is expected." - " It's like your file has less columns than expected.\n" - "And if your file has the right number of columns, " - "maybe it has an unescaped backslash in value before tab, which causes the tab to be escaped.\n"; - } - else if (*in->position() == '\r') - { - out << "ERROR: Carriage return found where tab is expected.\n"; - } - else - { - out << "ERROR: There is no tab. "; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n"; - } - return false; - } + out << "ERROR: There is no tab. "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; } + return false; } return true; } -void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { - const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; - if (index) - { - bool can_be_parsed_as_null = removeLowCardinality(type)->isNullable(); + if (in->eof()) + return true; - // check null value for type is not nullable. don't cross buffer bound for simplicity, so maybe missing some case - if (!can_be_parsed_as_null && !in->eof()) + try + { + assertChar('\n', *in); + } + catch (const DB::Exception &) + { + if (*in->position() == '\t') { - if (*in->position() == '\\' && in->available() >= 2) + out << "ERROR: Tab found where line feed is expected." + " It's like your file has more columns than expected.\n" + "And if your file has the right number of columns, maybe it has an unescaped tab in a value.\n"; + } + else if (*in->position() == '\r') + { + out << "ERROR: Carriage return found where line feed is expected." + " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; + } + else + { + out << "ERROR: There is no line feed. "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; + } + return false; + } + + return true; +} + +void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type) +{ + bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default; + + // check null value for type is not nullable. 
don't cross buffer bound for simplicity, so maybe missing some case + if (!can_be_parsed_as_null && !in->eof()) + { + if (*in->position() == '\\' && in->available() >= 2) + { + ++in->position(); + if (*in->position() == 'N') { ++in->position(); - if (*in->position() == 'N') - { - ++in->position(); - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected NULL value of not Nullable type {}", type->getName()); - } - else - { - --in->position(); - } + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected NULL value of not Nullable type {}", type->getName()); + } + else + { + --in->position(); } } - - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column); - } - else - { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); } } @@ -375,67 +218,28 @@ void TabSeparatedRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } -void TabSeparatedRowInputFormat::resetParser() -{ - RowInputFormatWithDiagnosticInfo::resetParser(); - const auto & sample = getPort().getHeader(); - column_mapping->read_columns.assign(sample.columns(), false); - column_mapping->column_indexes_for_input_fields.clear(); - columns_to_fill_with_default_values.clear(); -} - void registerInputFormatTabSeparated(FormatFactory & factory) { - for (const auto * name : {"TabSeparated", "TSV"}) + for (bool is_raw : {false, true}) { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - return std::make_shared(sample, buf, params, false, false, settings); - }); - } + factory.registerInputFormat(format_name, [with_names, with_types, is_raw]( + ReadBuffer & buf, + const Block & sample, + IRowInputFormat::Params params, + const FormatSettings & settings) + { + return std::make_shared(sample, buf, std::move(params), with_names, with_types, is_raw, settings); + }); + }; - for (const auto * name : {"TabSeparatedRaw", "TSVRaw"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(sample, buf, params, false, false, settings); - }); - } - - for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(sample, buf, params, true, false, settings); - }); - } - - for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(sample, buf, params, true, true, settings); - }); + registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + registerWithNamesAndTypes(is_raw ? 
"TSVRaw" : "TSV", register_func); } } -static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows) { bool need_more_data = true; char * pos = in.position(); @@ -443,13 +247,18 @@ static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer while (loadAtPosition(in, memory, pos) && need_more_data) { - pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end()); + if (is_raw) + pos = find_first_symbols<'\r', '\n'>(pos, in.buffer().end()); + else + pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end()); if (pos > in.buffer().end()) - throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); - else if (pos == in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + + if (pos == in.buffer().end()) continue; - else if (*pos == '\\') + + if (!is_raw && *pos == '\\') { ++pos; if (loadAtPosition(in, memory, pos)) @@ -460,7 +269,7 @@ static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer if (*pos == '\n') ++number_of_rows; - if (memory.size() + static_cast(pos - in.position()) >= min_chunk_size) + if ((memory.size() + static_cast(pos - in.position()) >= min_chunk_size) && number_of_rows >= min_rows) need_more_data = false; ++pos; } @@ -473,11 +282,29 @@ static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer void registerFileSegmentationEngineTabSeparated(FormatFactory & factory) { - // We can use the same segmentation engine for TSKV. - for (const auto & name : {"TabSeparated", "TSV", "TSKV", "TabSeparatedWithNames", "TSVWithNames"}) + for (bool is_raw : {false, true}) { - factory.registerFileSegmentationEngine(name, &fileSegmentationEngineTabSeparatedImpl); + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + size_t min_rows = 1 + int(with_names) + int(with_types); + factory.registerFileSegmentationEngine(format_name, [is_raw, min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) + { + return fileSegmentationEngineTabSeparatedImpl(in, memory, min_chunk_size, is_raw, min_rows); + }); + }; + + registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); + registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); } + + // We can use the same segmentation engine for TSKV. + factory.registerFileSegmentationEngine("TSKV", []( + ReadBuffer & in, + DB::Memory<> & memory, + size_t min_chunk_size) + { + return fileSegmentationEngineTabSeparatedImpl(in, memory, min_chunk_size, false, 1); + }); } } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 8127b5ceba7..11a788bc900 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB @@ -10,46 +10,43 @@ namespace DB /** A stream to input data in tsv format. 
*/ -class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo +class TabSeparatedRowInputFormat : public RowInputFormatWithNamesAndTypes { public: /** with_names - the first line is the header with the names of the columns * with_types - on the next line header with type names */ TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_); + bool with_names_, bool with_types_, bool is_raw, const FormatSettings & format_settings_); String getName() const override { return "TabSeparatedRowInputFormat"; } - bool readRow(MutableColumns & columns, RowReadExtension &) override; - void readPrefix() override; bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - void resetParser() override; - -protected: - bool with_names; - bool with_types; - const FormatSettings format_settings; - - virtual bool readField(IColumn & column, const DataTypePtr & type, - const SerializationPtr & serialization, bool is_last_file_column); - private: - DataTypes data_types; + bool is_raw; - using IndexesMap = std::unordered_map; - IndexesMap column_indexes_by_names; + bool readField(IColumn & column, const DataTypePtr & type, + const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - std::vector columns_to_fill_with_default_values; + void skipField(size_t /*file_column*/) override { skipField(); } + void skipField(); + void skipHeaderRow(); + void skipNames() override { skipHeaderRow(); } + void skipTypes() override { skipHeaderRow(); } + void skipFieldDelimiter() override; + void skipRowEndDelimiter() override; - void addInputColumn(const String & column_name); - void setupAllColumnsByTableSchema(); - void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension); + std::vector readHeaderRow(); + std::vector readNames() override { return readHeaderRow(); } + std::vector readTypes() override { return readHeaderRow(); } + String readFieldIntoString(); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; + void checkNullValueForNonNullable(DataTypePtr type) override; + + bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; + bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } }; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index 71d5bdba355..df0c19ad409 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include @@ -11,41 +11,43 @@ TabSeparatedRowOutputFormat::TabSeparatedRowOutputFormat( const Block & header_, bool with_names_, bool with_types_, + bool is_raw_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) + : IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), is_raw(is_raw_), format_settings(format_settings_) { } +void TabSeparatedRowOutputFormat::writeLine(const 
std::vector & values) +{ + for (size_t i = 0; i < values.size(); ++i) + { + writeEscapedString(values[i], out); + if (i + 1 == values.size()) + writeRowEndDelimiter(); + else + writeFieldDelimiter(); + } +} void TabSeparatedRowOutputFormat::doWritePrefix() { const auto & header = getPort(PortKind::Main).getHeader(); - size_t columns = header.columns(); if (with_names) - { - for (size_t i = 0; i < columns; ++i) - { - writeEscapedString(header.safeGetByPosition(i).name, out); - writeChar(i == columns - 1 ? '\n' : '\t', out); - } - } + writeLine(header.getNames()); if (with_types) - { - for (size_t i = 0; i < columns; ++i) - { - writeEscapedString(header.safeGetByPosition(i).type->getName(), out); - writeChar(i == columns - 1 ? '\n' : '\t', out); - } - } + writeLine(header.getDataTypeNames()); } void TabSeparatedRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - serialization.serializeTextEscaped(column, row_num, out, format_settings); + if (is_raw) + serialization.serializeTextRaw(column, row_num, out, format_settings); + else + serialization.serializeTextEscaped(column, row_num, out, format_settings); } @@ -75,56 +77,24 @@ void TabSeparatedRowOutputFormat::writeBeforeExtremes() void registerOutputFormatTabSeparated(FormatFactory & factory) { - for (const auto * name : {"TabSeparated", "TSV"}) + for (bool is_raw : {false, true}) { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - return std::make_shared(buf, sample, false, false, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); - } + factory.registerOutputFormat(format_name, [is_raw, with_names, with_types]( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams & params, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, with_names, with_types, is_raw, params, settings); + }); - for (const auto * name : {"TabSeparatedRaw", "TSVRaw"}) - { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, false, false, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); - } + factory.markOutputFormatSupportsParallelFormatting(format_name); + }; - for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"}) - { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, true, false, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); - } - - for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"}) - { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, true, true, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); + registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); + registerWithNamesAndTypes(is_raw ? 
"TabSeparatedRaw" : "TabSeparated", register_func); } } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h index e3190be70e8..7dcc6529f1c 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h @@ -23,6 +23,7 @@ public: const Block & header_, bool with_names_, bool with_types_, + bool is_raw_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_); @@ -39,10 +40,13 @@ public: /// https://www.iana.org/assignments/media-types/text/tab-separated-values String getContentType() const override { return "text/tab-separated-values; charset=UTF-8"; } -protected: - +private: + void writeLine(const std::vector & values); bool with_names; bool with_types; + bool is_raw; + +protected: const FormatSettings format_settings; }; diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index ed98ab372b6..db5db4701a9 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -130,7 +130,7 @@ void TemplateBlockOutputFormat::serializeField(const IColumn & column, const ISe serialization.serializeTextXML(column, row_num, out, settings); break; case ColumnFormat::Raw: - serialization.serializeText(column, row_num, out, settings); + serialization.serializeTextRaw(column, row_num, out, settings); break; default: __builtin_unreachable(); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 021b2532b39..c096b62e967 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -45,8 +45,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer } else { - if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw) - format.throwInvalidFormat("XML and Raw deserialization is not supported", i); + if (format.formats[i] == ColumnFormat::Xml) + format.throwInvalidFormat("XML deserialization is not supported", i); } } @@ -54,8 +54,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) - row_format.throwInvalidFormat("XML and Raw deserialization is not supported", i); + if (row_format.formats[i] == ColumnFormat::Xml) + row_format.throwInvalidFormat("XML deserialization is not supported", i); if (row_format.format_idx_to_column_idx[i]) { @@ -194,7 +194,7 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, { ColumnFormat col_format = row_format.formats[file_column]; bool read = true; - bool parse_as_nullable = settings.null_as_default && !type->isNullable(); + bool parse_as_nullable = settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); try { switch (col_format) @@ -226,6 +226,12 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, else serialization->deserializeTextJSON(column, buf, settings); break; + case ColumnFormat::Raw: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextRawImpl(column, buf, settings, serialization); + else + 
serialization->deserializeTextRaw(column, buf, settings); + break; default: __builtin_unreachable(); } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index c15d0d608ee..5f471dc0151 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -180,7 +180,7 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx) bool read = true; const auto & type = types[column_idx]; const auto & serialization = serializations[column_idx]; - if (format_settings.null_as_default && !type->isNullable()) + if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable()) read = SerializationNullable::deserializeTextQuotedImpl(column, *buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, *buf, format_settings); @@ -421,7 +421,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx Field value = convertFieldToType(expression_value, type, value_raw.second.get()); /// Check that we are indeed allowed to insert a NULL. - if (value.isNull() && !type.isNullable()) + if (value.isNull() && !type.isNullable() && !type.isLowCardinalityNullable()) { if (format_settings.null_as_default) { diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp new file mode 100644 index 00000000000..b5690d9dafb --- /dev/null +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + +RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + bool with_names_, + bool with_types_, + const FormatSettings & format_settings_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_) +{ + const auto & sample = getPort().getHeader(); + size_t num_columns = sample.columns(); + + data_types.resize(num_columns); + column_indexes_by_names.reserve(num_columns); + + for (size_t i = 0; i < num_columns; ++i) + { + const auto & column_info = sample.getByPosition(i); + + data_types[i] = column_info.type; + column_indexes_by_names.emplace(column_info.name, i); + } +} + +void RowInputFormatWithNamesAndTypes::setupAllColumnsByTableSchema() +{ + const auto & header = getPort().getHeader(); + column_mapping->column_indexes_for_input_fields.resize(header.columns()); + column_mapping->names_of_columns = header.getNames(); + + for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i) + column_mapping->column_indexes_for_input_fields[i] = i; +} + +void RowInputFormatWithNamesAndTypes::addInputColumn(const String & column_name, std::vector & read_columns) +{ + column_mapping->names_of_columns.push_back(column_name); + + const auto column_it = column_indexes_by_names.find(column_name); + if (column_it == column_indexes_by_names.end()) + { + if (format_settings.skip_unknown_fields) + { + column_mapping->column_indexes_for_input_fields.push_back(std::nullopt); + return; + } + + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Unknown field found in {} header: '{}' at position {}\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", + getName(), column_name, 
column_mapping->column_indexes_for_input_fields.size()); + } + + const auto column_index = column_it->second; + + if (read_columns[column_index]) + throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA); + + read_columns[column_index] = true; + column_mapping->column_indexes_for_input_fields.emplace_back(column_index); +} + +void RowInputFormatWithNamesAndTypes::readPrefix() +{ + if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8()) + { + /// We assume that column name or type cannot contain BOM, so, if format has header, + /// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it. + skipBOMIfExists(*in); + } + + /// This is a bit of abstraction leakage, but we need it in parallel parsing: + /// we check if this InputFormat is working with the "real" beginning of the data. + if (with_names && getCurrentUnitNumber() == 0) + { + if (format_settings.with_names_use_header) + { + std::vector read_columns(data_types.size(), false); + auto column_names = readNames(); + for (const auto & name : column_names) + addInputColumn(name, read_columns); + + for (size_t i = 0; i != read_columns.size(); ++i) + { + if (!read_columns[i]) + column_mapping->not_presented_columns.push_back(i); + } + } + else + { + setupAllColumnsByTableSchema(); + skipNames(); + } + } + else if (!column_mapping->is_set) + setupAllColumnsByTableSchema(); + + if (with_types && getCurrentUnitNumber() == 0) + { + if (format_settings.with_types_use_header) + { + auto types = readTypes(); + if (types.size() != column_mapping->column_indexes_for_input_fields.size()) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "The number of data types differs from the number of column names in input data"); + + /// Check that types from input matches types from header. 
+ for (size_t i = 0; i < types.size(); ++i) + { + if (column_mapping->column_indexes_for_input_fields[i] && + data_types[*column_mapping->column_indexes_for_input_fields[i]]->getName() != types[i]) + { + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Type of '{}' must be {}, not {}", + getPort().getHeader().getByPosition(*column_mapping->column_indexes_for_input_fields[i]).name, + data_types[*column_mapping->column_indexes_for_input_fields[i]]->getName(), types[i]); + } + } + } + else + skipTypes(); + } +} + +void RowInputFormatWithNamesAndTypes::insertDefaultsForNotSeenColumns(MutableColumns & columns, RowReadExtension & ext) +{ + for (auto index : column_mapping->not_presented_columns) + { + columns[index]->insertDefault(); + ext.read_columns[index] = false; + } +} + +bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadExtension & ext) +{ + if (in->eof()) + return false; + + updateDiagnosticInfo(); + skipRowStartDelimiter(); + + ext.read_columns.resize(data_types.size()); + for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + { + const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; + const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); + if (column_index) + ext.read_columns[*column_index] = readField( + *columns[*column_index], + data_types[*column_index], + serializations[*column_index], + is_last_file_column, + column_mapping->names_of_columns[file_column]); + else + skipField(file_column); + + if (!is_last_file_column) + skipFieldDelimiter(); + } + + skipRowEndDelimiter(); + + insertDefaultsForNotSeenColumns(columns, ext); + + /// If defaults_for_omitted_fields is set to 0, we should leave already inserted defaults. 
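// Illustrative sketch (not part of this diff): ext.read_columns tells the caller
// which columns received a real value; for the rest, the caller may later
// evaluate DEFAULT expressions. With input_format_defaults_for_omitted_fields = 0
// that machinery is disabled, so (as in the branch below) every column is
// reported as "read" and the plain type defaults inserted above are kept. In
// effect:

#include <vector>

void finalizeReadColumns(std::vector<bool> & read_columns, bool defaults_for_omitted_fields)
{
    if (!defaults_for_omitted_fields)
        read_columns.assign(read_columns.size(), true);  // keep the inserted type defaults
}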
+ if (!format_settings.defaults_for_omitted_fields) + ext.read_columns.assign(ext.read_columns.size(), true); + + return true; +} + +void RowInputFormatWithNamesAndTypes::resetParser() +{ + RowInputFormatWithDiagnosticInfo::resetParser(); + column_mapping->column_indexes_for_input_fields.clear(); + column_mapping->not_presented_columns.clear(); + column_mapping->names_of_columns.clear(); +} + +void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +{ + const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; + if (index) + { + checkNullValueForNonNullable(type); + const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); + readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); + } + else + { + skipField(file_column); + } +} + +bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) +{ + if (in->eof()) + { + out << "\n"; + return false; + } + + if (!parseRowStartWithDiagnosticInfo(out)) + return false; + + for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + { + if (column_mapping->column_indexes_for_input_fields[file_column].has_value()) + { + const auto & header = getPort().getHeader(); + size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value(); + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, file_column)) + return false; + } + else + { + static const String skipped_column_str = ""; + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) + return false; + } + + /// Delimiters + if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size()) + { + if (!parseFieldDelimiterWithDiagnosticInfo(out)) + return false; + } + } + + return parseRowEndWithDiagnosticInfo(out); +} + + +void registerFileSegmentationEngineForFormatWithNamesAndTypes( + FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine) +{ + factory.registerFileSegmentationEngine(base_format_name, segmentation_engine); + factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine); + factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine); +} + + +} diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h new file mode 100644 index 00000000000..0fd83238f5f --- /dev/null +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -0,0 +1,85 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +/// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes. +/// It accepts 2 parameters in constructor - with_names and with_types and implements +/// input format depending on them: +/// - if with_names is true, it will expect that the first row of data contains column +/// names. If setting input_format_with_names_use_header is set to 1, columns mapping +/// will be performed. 
+/// - if with_types is true, it will expect that the second row of data contains column +/// types. If setting input_format_with_types_use_header is set to 1, types from the input +/// will be compared with the types from the header. +/// It's important that this class first reads/skips names and only then reads/skips types, +/// so you can rely on this invariant. +class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo +{ +public: + /** with_names - in the first line the header with column names + * with_types - in the second line the header with type names + */ + RowInputFormatWithNamesAndTypes( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + bool with_names_, bool with_types_, const FormatSettings & format_settings_); + + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; + void readPrefix() override; + void resetParser() override; + +protected: + /// Read a single field from the input. Return false if there was no real value and we inserted a default value. + virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; + + /// Skip a single field; it's used to skip unknown columns. + virtual void skipField(size_t file_column) = 0; + /// Skip the whole row with names. + virtual void skipNames() = 0; + /// Skip the whole row with types. + virtual void skipTypes() = 0; + + /// Skip delimiters, if any. + virtual void skipRowStartDelimiter() {} + virtual void skipFieldDelimiter() {} + virtual void skipRowEndDelimiter() {} + + /// Methods for parsing with diagnostic info. + virtual void checkNullValueForNonNullable(DataTypePtr) {} + virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true; } + bool isGarbageAfterField(size_t, ReadBuffer::Position) override { return false; } + + /// Read the row with names and return the list of them. + virtual std::vector readNames() = 0; + /// Read the row with types and return the list of them. 
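// Illustrative sketch (not part of this diff): RowInputFormatWithNamesAndTypes
// is a classic template method -- readPrefix()/readRow() fix the control flow,
// while each concrete format only supplies the low-level hooks. A toy
// standalone version of the pattern (names hypothetical), which also shows the
// names-before-types invariant documented above:

#include <string>
#include <vector>

class RowFormatBase
{
public:
    virtual ~RowFormatBase() = default;

    // The driver: fixed order of steps, format-specific work delegated to hooks.
    std::vector<std::vector<std::string>> parse(bool with_names, bool with_types)
    {
        if (with_names)
            readHeaderRow();   // the names row is always consumed first ...
        if (with_types)
            readHeaderRow();   // ... and the types row strictly second
        std::vector<std::vector<std::string>> rows;
        while (!eof())
            rows.push_back(readDataRow());
        return rows;
    }

protected:
    // Hooks each concrete format (TSV, CSV, JSONCompactEachRow, ...) overrides.
    virtual std::vector<std::string> readHeaderRow() = 0;
    virtual std::vector<std::string> readDataRow() = 0;
    virtual bool eof() const = 0;
};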
+ virtual std::vector readTypes() = 0; + + const FormatSettings format_settings; + DataTypes data_types; + +private: + bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; + void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; + + void setupAllColumnsByTableSchema(); + void addInputColumn(const String & column_name, std::vector & read_columns); + void insertDefaultsForNotSeenColumns(MutableColumns & columns, RowReadExtension & ext); + + bool with_names; + bool with_types; + std::unordered_map column_indexes_by_names; +}; + +void registerFileSegmentationEngineForFormatWithNamesAndTypes( + FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine); + +} diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index dd851f19906..021335fea1f 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -218,7 +218,7 @@ bool IStorage::isStaticStorage() const return false; } -BackupEntries IStorage::backup(const ASTs &, ContextPtr) const +BackupEntries IStorage::backup(const ASTs &, ContextPtr) { throw Exception("Table engine " + getName() + " doesn't support backups", ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 6c4bb44b63b..fa5f2c28b06 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -205,7 +205,7 @@ public: NameDependencies getDependentViewsByColumn(ContextPtr context) const; /// Prepares entries to backup data of the storage. - virtual BackupEntries backup(const ASTs & partitions, ContextPtr context) const; + virtual BackupEntries backup(const ASTs & partitions, ContextPtr context); /// Extract data from the backup and put it to the storage. virtual RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context); diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index d53faf144f9..dda7f235d97 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -333,8 +333,9 @@ const KeyCondition::AtomMap KeyCondition::atom_map [] (RPNElement & out, const Field &) { out.function = RPNElement::FUNCTION_IS_NULL; - // When using NULL_LAST, isNull means [+Inf, +Inf] - out.range = Range(Field(POSITIVE_INFINITY)); + // isNull means +Inf (NULLS_LAST) or -Inf (NULLS_FIRST), + // which is equivalent to not in Range (-Inf, +Inf) + out.range = Range(); return true; } } @@ -2002,7 +2003,10 @@ BoolMask KeyCondition::checkInHyperrectangle( /// No need to apply monotonic functions as nulls are kept. 
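// Illustrative sketch (not part of this diff): NULL values sort to one end of
// the key space (-Inf with NULLS FIRST, +Inf with NULLS LAST). The open
// interval (-Inf, +Inf) contains every real value but neither infinity, so
// "x IS NULL" is exactly the negation of "x in (-Inf, +Inf)" -- which is why
// the code below checks the universal range and then flips the resulting mask.
// A minimal standalone model:

struct BoolMask
{
    bool can_be_true;
    bool can_be_false;
    BoolMask operator!() const { return {can_be_false, can_be_true}; }
};

// The two flags describe how the part's key range relates to (-Inf, +Inf).
BoolMask checkIsNull(bool intersects_universal_range, bool contained_in_universal_range)
{
    BoolMask in_range{intersects_universal_range, !contained_in_universal_range};
    return !in_range;  // IS NULL == NOT (value in (-Inf, +Inf))
}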
bool intersects = element.range.intersectsRange(*key_range); bool contains = element.range.containsRange(*key_range); + rpn_stack.emplace_back(intersects, !contains); + if (element.function == RPNElement::FUNCTION_IS_NULL) + rpn_stack.back() = !rpn_stack.back(); } else if ( element.function == RPNElement::FUNCTION_IN_SET diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index daa496fa517..c5c14cd24a8 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3500,7 +3500,7 @@ Pipe MergeTreeData::alterPartition( } -BackupEntries MergeTreeData::backup(const ASTs & partitions, ContextPtr local_context) const +BackupEntries MergeTreeData::backup(const ASTs & partitions, ContextPtr local_context) { DataPartsVector data_parts; if (partitions.empty()) @@ -3522,7 +3522,7 @@ BackupEntries MergeTreeData::backupDataParts(const DataPartsVector & data_parts) auto temp_dir_it = temp_dirs.find(disk); if (temp_dir_it == temp_dirs.end()) - temp_dir_it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp_backup_")).first; + temp_dir_it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp/backup_")).first; auto temp_dir_owner = temp_dir_it->second; fs::path temp_dir = temp_dir_owner->getPath(); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 22ec7ce6f53..1b617a2ec71 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -630,7 +630,7 @@ public: TableLockHolder & table_lock_holder); /// Prepares entries to backup data of the storage. - BackupEntries backup(const ASTs & partitions, ContextPtr context) const override; + BackupEntries backup(const ASTs & partitions, ContextPtr context) override; static BackupEntries backupDataParts(const DataPartsVector & data_parts); /// Extract data from the backup and put it to the storage. diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 301d81919c1..297c0397642 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,11 @@ #include #include +#include +#include +#include +#include + #include #include @@ -47,6 +53,7 @@ namespace ErrorCodes extern const int SIZES_OF_MARKS_FILES_ARE_INCONSISTENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int INCORRECT_FILE_NAME; + extern const int NOT_IMPLEMENTED; } /// NOTE: The lock `StorageLog::rwlock` is NOT kept locked while reading, @@ -880,6 +887,162 @@ IStorage::ColumnSizeByName StorageLog::getColumnSizes() const } +BackupEntries StorageLog::backup(const ASTs & partitions, ContextPtr context) +{ + if (!partitions.empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); + + auto lock_timeout = getLockTimeout(context); + loadMarks(lock_timeout); + + ReadLock lock{rwlock, lock_timeout}; + if (!lock) + throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + + if (!num_data_files || !file_checker.getFileSize(data_files[INDEX_WITH_REAL_ROW_COUNT].path)) + return {}; + + auto temp_dir_owner = std::make_shared(disk, "tmp/backup_"); + auto temp_dir = temp_dir_owner->getPath(); + disk->createDirectories(temp_dir); + + BackupEntries backup_entries; + + /// *.bin + for (const auto & data_file : data_files) + { + /// We make a copy of the data file because it can be changed later in write() or in truncate(). 
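// Illustrative sketch (not part of this diff): the surrounding loop snapshots
// each *.bin file by copying it into a temporary directory while the read lock
// is held, and records the size known to the file checker -- so rows appended
// after the backup started can never leak into the backup. Standalone, with
// hypothetical names:

#include <cstdint>
#include <filesystem>
#include <string>

struct SnapshotEntry
{
    std::string name_in_backup;       // file name as stored inside the backup
    std::filesystem::path snapshot;   // immutable temp copy of the live file
    std::uintmax_t size;              // size committed at snapshot time
};

SnapshotEntry snapshotFile(const std::filesystem::path & source,
                           const std::filesystem::path & temp_dir,
                           std::uintmax_t committed_size)  // from the file checker
{
    auto copy_path = temp_dir / source.filename();
    std::filesystem::copy_file(source, copy_path);  // taken under the read lock
    return {source.filename().string(), copy_path, committed_size};
}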
+ String data_file_name = fileName(data_file.path); + String temp_file_path = temp_dir + "/" + data_file_name; + disk->copy(data_file.path, disk, temp_file_path); + backup_entries.emplace_back( + data_file_name, + std::make_unique( + disk, temp_file_path, file_checker.getFileSize(data_file.path), std::nullopt, temp_dir_owner)); + } + + /// __marks.mrk + if (use_marks_file) + { + /// We make a copy of the data file because it can be changed later in write() or in truncate(). + String marks_file_name = fileName(marks_file_path); + String temp_file_path = temp_dir + "/" + marks_file_name; + disk->copy(marks_file_path, disk, temp_file_path); + backup_entries.emplace_back( + marks_file_name, + std::make_unique( + disk, temp_file_path, file_checker.getFileSize(marks_file_path), std::nullopt, temp_dir_owner)); + } + + /// sizes.json + String files_info_path = file_checker.getPath(); + backup_entries.emplace_back(fileName(files_info_path), std::make_unique(disk, files_info_path)); + + /// columns.txt + backup_entries.emplace_back( + "columns.txt", std::make_unique(getInMemoryMetadata().getColumns().getAllPhysical().toString())); + + /// count.txt + if (use_marks_file) + { + size_t num_rows = data_files[INDEX_WITH_REAL_ROW_COUNT].marks.empty() ? 0 : data_files[INDEX_WITH_REAL_ROW_COUNT].marks.back().rows; + backup_entries.emplace_back("count.txt", std::make_unique(toString(num_rows))); + } + + return backup_entries; +} + +RestoreDataTasks StorageLog::restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) +{ + if (!partitions.empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); + + auto restore_task = [this, backup, data_path_in_backup, context]() + { + auto lock_timeout = getLockTimeout(context); + WriteLock lock{rwlock, lock_timeout}; + if (!lock) + throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + + if (!num_data_files) + return; + + /// Load the marks if not loaded yet. We have to do that now because we're going to update these marks. + loadMarks(lock); + + /// If there were no files, save zero file sizes to be able to rollback in case of error. + saveFileSizes(lock); + + try + { + /// Append data files. + for (const auto & data_file : data_files) + { + String file_path_in_backup = data_path_in_backup + fileName(data_file.path); + auto backup_entry = backup->read(file_path_in_backup); + auto in = backup_entry->getReadBuffer(); + auto out = disk->writeFile(data_file.path, max_compress_block_size, WriteMode::Append); + copyData(*in, *out); + } + + if (use_marks_file) + { + /// Append marks. + size_t num_extra_marks = 0; + String file_path_in_backup = data_path_in_backup + fileName(marks_file_path); + size_t file_size = backup->getSize(file_path_in_backup); + if (file_size % (num_data_files * sizeof(Mark)) != 0) + throw Exception("Size of marks file is inconsistent", ErrorCodes::SIZES_OF_MARKS_FILES_ARE_INCONSISTENT); + + num_extra_marks = file_size / (num_data_files * sizeof(Mark)); + + size_t num_marks = data_files[0].marks.size(); + for (auto & data_file : data_files) + data_file.marks.reserve(num_marks + num_extra_marks); + + std::vector old_data_sizes; + std::vector old_num_rows; + old_data_sizes.resize(num_data_files); + old_num_rows.resize(num_data_files); + for (size_t i = 0; i != num_data_files; ++i) + { + old_data_sizes[i] = file_checker.getFileSize(data_files[i].path); + old_num_rows[i] = num_marks ? 
data_files[i].marks[num_marks - 1].rows : 0; + } + + auto backup_entry = backup->read(file_path_in_backup); + auto marks_rb = backup_entry->getReadBuffer(); + + for (size_t i = 0; i != num_extra_marks; ++i) + { + for (size_t j = 0; j != num_data_files; ++j) + { + Mark mark; + mark.read(*marks_rb); + mark.rows += old_num_rows[j]; /// Adjust the number of rows. + mark.offset += old_data_sizes[j]; /// Adjust the offset. + data_files[j].marks.push_back(mark); + } + } + } + + /// Finish writing. + saveMarks(lock); + saveFileSizes(lock); + } + catch (...) + { + /// Rollback partial writes. + file_checker.repair(); + removeUnsavedMarks(lock); + throw; + } + + }; + return {restore_task}; +} + + void registerStorageLog(StorageFactory & factory) { StorageFactory::StorageFeatures features{ diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index ca87d7dcf3e..8b2ef0ccac1 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -51,6 +51,9 @@ public: bool supportsSubcolumns() const override { return true; } ColumnSizeByName getColumnSizes() const override; + BackupEntries backup(const ASTs & partitions, ContextPtr context) override; + RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override; + protected: /** Attach the table with the appropriate name, along the appropriate path (with / at the end), * (the correctness of names and paths is not verified) diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index da620463ffa..92664a29767 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -9,10 +9,12 @@ #include #include +#include #include #include #include #include +#include #include #include @@ -33,6 +35,13 @@ #include #include +#include +#include +#include +#include + +#include + #include @@ -44,6 +53,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int INCORRECT_FILE_NAME; extern const int TIMEOUT_EXCEEDED; + extern const int NOT_IMPLEMENTED; } @@ -481,6 +491,134 @@ void StorageStripeLog::saveFileSizes(const WriteLock & /* already locked for wri } +BackupEntries StorageStripeLog::backup(const ASTs & partitions, ContextPtr context) +{ + if (!partitions.empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); + + auto lock_timeout = getLockTimeout(context); + loadIndices(lock_timeout); + + ReadLock lock{rwlock, lock_timeout}; + if (!lock) + throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + + if (!file_checker.getFileSize(data_file_path)) + return {}; + + auto temp_dir_owner = std::make_shared(disk, "tmp/backup_"); + auto temp_dir = temp_dir_owner->getPath(); + disk->createDirectories(temp_dir); + + BackupEntries backup_entries; + + /// data.bin + { + /// We make a copy of the data file because it can be changed later in write() or in truncate(). + String data_file_name = fileName(data_file_path); + String temp_file_path = temp_dir + "/" + data_file_name; + disk->copy(data_file_path, disk, temp_file_path); + backup_entries.emplace_back( + data_file_name, + std::make_unique( + disk, temp_file_path, file_checker.getFileSize(data_file_path), std::nullopt, temp_dir_owner)); + } + + /// index.mrk + { + /// We make a copy of the data file because it can be changed later in write() or in truncate(). 
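+        /// Here the copied file is the index; like the data file, it is read by the backup entry
+        /// only after the read lock has been released, hence the copy.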
+ String index_file_name = fileName(index_file_path); + String temp_file_path = temp_dir + "/" + index_file_name; + disk->copy(index_file_path, disk, temp_file_path); + backup_entries.emplace_back( + index_file_name, + std::make_unique( + disk, temp_file_path, file_checker.getFileSize(index_file_path), std::nullopt, temp_dir_owner)); + } + + /// sizes.json + String files_info_path = file_checker.getPath(); + backup_entries.emplace_back(fileName(files_info_path), std::make_unique(disk, files_info_path)); + + /// columns.txt + backup_entries.emplace_back( + "columns.txt", std::make_unique(getInMemoryMetadata().getColumns().getAllPhysical().toString())); + + /// count.txt + size_t num_rows = 0; + for (const auto & block : indices.blocks) + num_rows += block.num_rows; + backup_entries.emplace_back("count.txt", std::make_unique(toString(num_rows))); + + return backup_entries; +} + +RestoreDataTasks StorageStripeLog::restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) +{ + if (!partitions.empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); + + auto restore_task = [this, backup, data_path_in_backup, context]() + { + WriteLock lock{rwlock, getLockTimeout(context)}; + if (!lock) + throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + + /// Load the indices if not loaded yet. We have to do that now because we're going to update these indices. + loadIndices(lock); + + /// If there were no files, save zero file sizes to be able to rollback in case of error. + saveFileSizes(lock); + + try + { + /// Append the data file. + auto old_data_size = file_checker.getFileSize(data_file_path); + { + String file_path_in_backup = data_path_in_backup + fileName(data_file_path); + auto backup_entry = backup->read(file_path_in_backup); + auto in = backup_entry->getReadBuffer(); + auto out = disk->writeFile(data_file_path, max_compress_block_size, WriteMode::Append); + copyData(*in, *out); + } + + /// Append the index. + String index_path_in_backup = data_path_in_backup + fileName(index_file_path); + if (backup->exists(index_path_in_backup)) + { + IndexForNativeFormat extra_indices; + auto backup_entry = backup->read(index_path_in_backup); + auto index_in = backup_entry->getReadBuffer(); + CompressedReadBuffer index_compressed_in{*index_in}; + extra_indices.read(index_compressed_in); + + /// Adjust the offsets. + for (auto & block : extra_indices.blocks) + { + for (auto & column : block.columns) + column.location.offset_in_compressed_file += old_data_size; + } + + insertAtEnd(indices.blocks, std::move(extra_indices.blocks)); + } + + /// Finish writing. + saveIndices(lock); + saveFileSizes(lock); + } + catch (...) + { + /// Rollback partial writes. 
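+            /// file_checker.repair() truncates the files back to the last sizes saved in sizes.json.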
+ file_checker.repair(); + removeUnsavedIndices(lock); + throw; + } + + }; + return {restore_task}; +} + + void registerStorageStripeLog(StorageFactory & factory) { StorageFactory::StorageFeatures features{ diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index 7ab40f867dd..579e2f991e7 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -50,6 +50,9 @@ public: void truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder&) override; + BackupEntries backup(const ASTs & partitions, ContextPtr context) override; + RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override; + protected: StorageStripeLog( DiskPtr disk_, @@ -92,7 +95,7 @@ private: const size_t max_compress_block_size; - std::shared_timed_mutex rwlock; + mutable std::shared_timed_mutex rwlock; Poco::Logger * log; }; diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index a097e08871d..c33fa6cad44 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -26,6 +26,7 @@ const char * auto_contributors[] { "Aleksandra (Ася)", "Aleksandr Karo", "Aleksandrov Vladimir", + "Aleksandr Shalimov", "alekseik1", "Aleksei Levushkin", "Aleksei Semiglazov", @@ -36,6 +37,7 @@ const char * auto_contributors[] { "Alexander Avdonkin", "Alexander Bezpiatov", "Alexander Burmak", + "Alexander Chashnikov", "Alexander Ermolaev", "Alexander Gololobov", "Alexander GQ Gerasiov", @@ -101,9 +103,11 @@ const char * auto_contributors[] { "ana-uvarova", "AnaUvarova", "Andr0901", + "andrc1901", "Andreas Hunkeler", "AndreevDm", "Andrei Bodrov", + "Andrei Ch", "Andrei Chulkov", "andrei-karpliuk", "Andrei Nekrashevich", @@ -127,6 +131,7 @@ const char * auto_contributors[] { "Anmol Arora", "Anna", "Anna Shakhova", + "anneji", "anneji-dev", "annvsh", "anrodigina", @@ -181,6 +186,7 @@ const char * auto_contributors[] { "BanyRule", "Baudouin Giard", "BayoNet", + "bbkas", "benamazing", "benbiti", "Benjamin Naecker", @@ -190,6 +196,7 @@ const char * auto_contributors[] { "bharatnc", "Big Elephant", "Bill", + "BiteTheDDDDt", "BlahGeek", "blazerer", "bluebirddm", @@ -220,6 +227,7 @@ const char * auto_contributors[] { "Chao Wang", "chasingegg", "chengy8934", + "chenjian", "chenqi", "chenxing-xc", "chenxing.xc", @@ -232,7 +240,9 @@ const char * auto_contributors[] { "Ciprian Hacman", "Clement Rodriguez", "Clément Rodriguez", + "ClickHouse Admin", "cn-ds", + "Cody Baker", "Colum", "comunodi", "Constantin S. 
Pan", @@ -343,6 +353,7 @@ const char * auto_contributors[] { "fastio", "favstovol", "FawnD2", + "Federico Ceratto", "FeehanG", "feihengye", "felixoid", @@ -372,6 +383,7 @@ const char * auto_contributors[] { "fuwhu", "Fu Zhe", "fuzhe1989", + "fuzzERot", "Gagan Arneja", "Gao Qiang", "g-arslan", @@ -407,6 +419,7 @@ const char * auto_contributors[] { "heng zhao", "hermano", "hexiaoting", + "hhell", "Hiroaki Nakamura", "hotid", "huangzhaowei", @@ -456,6 +469,7 @@ const char * auto_contributors[] { "Ivan Kush", "Ivan Kushnarenko", "Ivan Lezhankin", + "Ivan Milov", "Ivan Remen", "Ivan Starkov", "ivanzhukov", @@ -470,6 +484,7 @@ const char * auto_contributors[] { "jasine", "Jason", "Jason Keirstead", + "jasperzhu", "javartisan", "javi", "javi santana", @@ -483,6 +498,7 @@ const char * auto_contributors[] { "Jiang Tao", "jianmei zhang", "jkuklis", + "João Figueiredo", "Jochen Schalanda", "John", "John Hummel", @@ -492,9 +508,11 @@ const char * auto_contributors[] { "jyz0309", "Kang Liu", "Karl Pietrzak", + "karnevil13", "keenwolf", "Keiji Yoshida", "Ken Chen", + "Ken MacInnis", "Kevin Chiang", "Kevin Michel", "kevin wan", @@ -509,6 +527,7 @@ const char * auto_contributors[] { "KochetovNicolai", "kolsys", "Konstantin Grabar", + "Konstantin Ilchenko", "Konstantin Lebedev", "Konstantin Malanchev", "Konstantin Podshumok", @@ -529,6 +548,7 @@ const char * auto_contributors[] { "l1tsolaiki", "lalex", "Latysheva Alexandra", + "laurieliyang", "lehasm", "Léo Ercolanelli", "Leonardo Cecchi", @@ -539,6 +559,9 @@ const char * auto_contributors[] { "levushkin aleksej", "levysh", "Lewinma", + "lhuang0928", + "lhuang09287750", + "liang.huang", "liangqian", "libenwang", "lichengxiang", @@ -593,6 +616,7 @@ const char * auto_contributors[] { "mastertheknife", "Matthew Peveler", "Matwey V. 
Kornilov", + "Mátyás Jani", "Max", "Max Akhmedov", "Max Bruce", @@ -613,11 +637,13 @@ const char * auto_contributors[] { "maxkuzn", "maxulan", "Max Vetrov", + "MaxWk", "Mc.Spring", "mehanizm", "MeiK", "melin", "memo", + "Memo", "meo", "meoww-bot", "mergify[bot]", @@ -636,6 +662,7 @@ const char * auto_contributors[] { "Miguel Fernández", "miha-g", "Mihail Fandyushin", + "mikael", "Mikahil Nacharov", "Mike", "Mike F", @@ -658,6 +685,8 @@ const char * auto_contributors[] { "millb", "Misko Lee", "mnkonkova", + "mo-avatar", + "Mohamad Fadhil", "Mohammad Hossein Sekhavat", "morty", "moscas", @@ -698,6 +727,7 @@ const char * auto_contributors[] { "nikitamikhaylov", "Nikita Mikhaylov", "Nikita Orlov", + "Nikita Tikhomirov", "Nikita Vasilev", "Nikolai Kochetov", "Nikolai Sorokin", @@ -739,15 +769,18 @@ const char * auto_contributors[] { "Paramtamtam", "Patrick Zippenfenig", "Pavel", + "Pavel Cheremushkin", "Pavel Kartaviy", "Pavel Kartavyy", "Pavel Kovalenko", "Pavel Kruglov", "Pavel Litvinenko", + "Pavel Medvedev", "Pavel Patrin", "Pavel Yakunin", "Pavlo Bashynskiy", "Pawel Rog", + "pawelsz-rb", "pdv-ru", "Peng Jian", "Persiyanov Dmitriy Andreevich", @@ -770,6 +803,7 @@ const char * auto_contributors[] { "qianmoQ", "quid", "Quid37", + "quoctan132", "r1j1k", "Rafael David Tinoco", "rainbowsysu", @@ -789,6 +823,7 @@ const char * auto_contributors[] { "robot-clickhouse", "robot-metrika-test", "rodrigargar", + "Rohit Agarwal", "Romain Neutron", "roman", "Roman Bug", @@ -816,6 +851,7 @@ const char * auto_contributors[] { "Saulius Valatka", "sdk2", "Sean Haynes", + "Sébastien", "Sébastien Launay", "serebrserg", "Sergei Bocharov", @@ -840,10 +876,12 @@ const char * auto_contributors[] { "sev7e0", "SevaCode", "sevirov", + "Seyed Mehrshad Hosseini", "sfod", "shangshujie", "shedx", "Sherry Wang", + "Shoh Jahon", "Silviu Caragea", "Simeon Emanuilov", "Simon Liu", @@ -887,16 +925,23 @@ const char * auto_contributors[] { "Taleh Zaliyev", "Tangaev", "tao jiang", + "Tatiana", "Tatiana Kirillova", "tavplubix", "TCeason", + "Teja", + "Teja Srivastasa", "Tema Novikov", "templarzq", + "Tentoshka", "terrylin", "The-Alchemist", "Thomas Berdy", + "Thom O'Connor", + "tianzhou", "Tiaonmmn", "tiger.yan", + "Tigran Khudaverdyan", "tison", "TiunovNN", "Tobias Adamson", @@ -934,7 +979,9 @@ const char * auto_contributors[] { "Veloman Yunkan", "Veniamin Gvozdikov", "Veselkov Konstantin", + "vesslanjin", "vgocoder", + "Viachaslau Boben", "vic", "vicdashkov", "vicgao", @@ -950,6 +997,7 @@ const char * auto_contributors[] { "Vitaliy Zakaznikov", "Vitaly", "Vitaly Baranov", + "Vitaly Orlov", "Vitaly Samigullin", "Vitaly Stoyan", "vitstn", @@ -980,6 +1028,7 @@ const char * auto_contributors[] { "vzakaznikov", "wangchao", "Wang Fenjin", + "WangZengrui", "weeds085490", "Weiqing Xu", "William Shallum", @@ -994,6 +1043,7 @@ const char * auto_contributors[] { "Yangkuan Liu", "yangshuai", "Yatsishin Ilya", + "yeer", "Yegor Andreenko", "Yegor Levankov", "ygrek", @@ -1023,6 +1073,7 @@ const char * auto_contributors[] { "ywill3", "zamulla", "zhang2014", + "zhanglistar", "zhangshengyu", "zhangxiao018", "zhangxiao871", @@ -1051,6 +1102,7 @@ const char * auto_contributors[] { "Смитюх Вячеслав", "Сундуков Алексей", "万康", + "凌涛", "吴健", "小路", "张中南", @@ -1058,8 +1110,10 @@ const char * auto_contributors[] { "张风啸", "徐炘", "曲正鹏", + "木木夕120", "未来星___费", "极客青年", + "枢木", "董海镔", "谢磊", "贾顺名(Jarvis)", diff --git a/tests/ci/metrics_lambda/app.py b/tests/ci/metrics_lambda/app.py index af0e0fe07f1..d88e3e45b07 100644 --- a/tests/ci/metrics_lambda/app.py +++ 
b/tests/ci/metrics_lambda/app.py
@@ -79,7 +79,7 @@ def list_runners(access_token):
 def group_runners_by_tag(listed_runners):
     result = {}
-    RUNNER_TYPE_LABELS = ['style-checker', 'builder', 'func-tester']
+    RUNNER_TYPE_LABELS = ['style-checker', 'builder', 'func-tester', 'stress-tester']
     for runner in listed_runners:
         for tag in runner.tags:
             if tag in RUNNER_TYPE_LABELS:
diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py
new file mode 100644
index 00000000000..a7730285025
--- /dev/null
+++ b/tests/ci/stress_check.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+
+import csv
+import logging
+import subprocess
+import os
+import json
+import time
+import sys
+
+from github import Github
+import requests
+
+from report import create_test_html_report
+from s3_helper import S3Helper
+from get_robot_token import get_best_robot_token
+from pr_info import PRInfo
+
+
+DOWNLOAD_RETRIES_COUNT = 5
+
+def download_build_with_progress(url, path):
+    logging.info("Downloading from %s to temp path %s", url, path)
+    for i in range(DOWNLOAD_RETRIES_COUNT):
+        try:
+            with open(path, 'wb') as f:
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+                total_length = response.headers.get('content-length')
+                if total_length is None or int(total_length) == 0:
+                    logging.info("No content-length, will download file without progress")
+                    f.write(response.content)
+                else:
+                    dl = 0
+                    total_length = int(total_length)
+                    logging.info("Content length is %ld bytes", total_length)
+                    for data in response.iter_content(chunk_size=4096):
+                        dl += len(data)
+                        f.write(data)
+                        if sys.stdout.isatty():
+                            done = int(50 * dl / total_length)
+                            percent = int(100 * float(dl) / total_length)
+                            eq_str = '=' * done
+                            space_str = ' ' * (50 - done)
+                            sys.stdout.write(f"\r[{eq_str}{space_str}] {percent}%")
+                            sys.stdout.flush()
+            break
+        except Exception as ex:
+            sys.stdout.write("\n")
+            time.sleep(3)
+            logging.info("Exception while downloading %s, retry %s", ex, i + 1)
+            if os.path.exists(path):
+                os.remove(path)
+    else:
+        raise Exception(f"Cannot download build from {url}, all retries exceeded")
+
+    sys.stdout.write("\n")
+    logging.info("Downloading finished")
+
+
+def download_builds(result_path, build_urls):
+    for url in build_urls:
+        if url.endswith('.deb'):
+            fname = os.path.basename(url)
+            logging.info("Will download %s to %s", fname, result_path)
+            download_build_with_progress(url, os.path.join(result_path, fname))
+
+def get_build_config(build_number, repo_path):
+    ci_config_path = os.path.join(repo_path, "tests/ci/ci_config.json")
+    with open(ci_config_path, 'r', encoding='utf-8') as ci_config:
+        config_dict = json.load(ci_config)
+    return config_dict['build_config'][build_number]
+
+def get_build_urls(build_config_str, reports_path):
+    for root, _, files in os.walk(reports_path):
+        for f in files:
+            if build_config_str in f:
+                logging.info("Found build report json %s", f)
+                with open(os.path.join(root, f), 'r', encoding='utf-8') as file_handler:
+                    build_report = json.load(file_handler)
+                    return build_report['build_urls']
+    return []
+
+def build_config_to_string(build_config):
+    if build_config["package-type"] == "performance":
+        return "performance"
+
+    return "_".join([
+        build_config['compiler'],
+        build_config['build-type'] if build_config['build-type'] else "relwithdebuginfo",
+        build_config['sanitizer'] if build_config['sanitizer'] else "none",
+        build_config['bundled'],
+        build_config['splitted'],
+        "tidy" if build_config['tidy'] == "enable" else "notidy",
+        "with_coverage" if
build_config['with_coverage'] else "without_coverage", + build_config['package-type'], + ]) + + +def get_run_command(build_path, result_folder, server_log_folder, image): + cmd = "docker run -e S3_URL='https://clickhouse-datasets.s3.amazonaws.com' " + \ + f"--volume={build_path}:/package_folder " \ + f"--volume={result_folder}:/test_output " \ + f"--volume={server_log_folder}:/var/log/clickhouse-server {image}" + + return cmd + +def process_logs(s3_client, additional_logs, s3_path_prefix): + additional_urls = [] + for log_path in additional_logs: + if log_path: + additional_urls.append( + s3_client.upload_test_report_to_s3( + log_path, + s3_path_prefix + "/" + os.path.basename(log_path))) + + return additional_urls + +def upload_results(s3_client, pr_number, commit_sha, test_results, raw_log, additional_files, check_name): + additional_files = [raw_log] + additional_files + s3_path_prefix = f"{pr_number}/{commit_sha}/" + check_name.lower().replace(' ', '_').replace('(', '_').replace(')', '_').replace(',', '_') + additional_urls = process_logs(s3_client, additional_files, s3_path_prefix) + + branch_url = "https://github.com/ClickHouse/ClickHouse/commits/master" + branch_name = "master" + if pr_number != 0: + branch_name = f"PR #{pr_number}" + branch_url = f"https://github.com/ClickHouse/ClickHouse/pull/{pr_number}" + commit_url = f"https://github.com/ClickHouse/ClickHouse/commit/{commit_sha}" + + task_url = f"https://github.com/ClickHouse/ClickHouse/actions/runs/{os.getenv('GITHUB_RUN_ID')}" + + raw_log_url = additional_urls[0] + additional_urls.pop(0) + + html_report = create_test_html_report(check_name, test_results, raw_log_url, task_url, branch_url, branch_name, commit_url, additional_urls, True) + with open('report.html', 'w', encoding='utf-8') as f: + f.write(html_report) + + url = s3_client.upload_test_report_to_s3('report.html', s3_path_prefix + ".html") + logging.info("Search result in url %s", url) + return url + +def get_commit(gh, commit_sha): + repo = gh.get_repo(os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse")) + commit = repo.get_commit(commit_sha) + return commit + +def process_results(result_folder, server_log_path, run_log_path): + test_results = [] + additional_files = [] + # Just upload all files from result_folder. + # If task provides processed results, then it's responsible for content of result_folder. 
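+    # The raw run log and the server logs are attached below in any case.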
+    if os.path.exists(result_folder):
+        test_files = [f for f in os.listdir(result_folder) if os.path.isfile(os.path.join(result_folder, f))]
+        additional_files = [os.path.join(result_folder, f) for f in test_files]
+
+    if os.path.exists(server_log_path):
+        server_log_files = [f for f in os.listdir(server_log_path) if os.path.isfile(os.path.join(server_log_path, f))]
+        additional_files = additional_files + [os.path.join(server_log_path, f) for f in server_log_files]
+
+    additional_files.append(run_log_path)
+
+    status_path = os.path.join(result_folder, "check_status.tsv")
+    if not os.path.exists(status_path):
+        return "failure", "check_status.tsv doesn't exist", test_results, additional_files
+
+    logging.info("Found check_status.tsv")
+    with open(status_path, 'r', encoding='utf-8') as status_file:
+        status = list(csv.reader(status_file, delimiter='\t'))
+
+    if len(status) != 1 or len(status[0]) != 2:
+        return "error", "Invalid check_status.tsv", test_results, additional_files
+    state, description = status[0][0], status[0][1]
+
+    results_path = os.path.join(result_folder, "test_results.tsv")
+    with open(results_path, 'r', encoding='utf-8') as results_file:
+        test_results = list(csv.reader(results_file, delimiter='\t'))
+    if len(test_results) == 0:
+        raise Exception("Empty results")
+
+    return state, description, test_results, additional_files
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    temp_path = os.getenv("TEMP_PATH", os.path.abspath("."))
+    repo_path = os.getenv("REPO_COPY", os.path.abspath("../../"))
+    reports_path = os.getenv("REPORTS_PATH", "./reports")
+
+    check_name = sys.argv[1]
+    build_number = int(sys.argv[2])
+
+    if not os.path.exists(temp_path):
+        os.makedirs(temp_path)
+
+    with open(os.getenv('GITHUB_EVENT_PATH'), 'r', encoding='utf-8') as event_file:
+        event = json.load(event_file)
+
+    pr_info = PRInfo(event)
+
+    gh = Github(get_best_robot_token())
+
+    images_path = None
+    for root, _, files in os.walk(reports_path):
+        for f in files:
+            if f == 'changed_images.json':
+                images_path = os.path.join(root, 'changed_images.json')
+                break
+
+    image_name = "clickhouse/stress-test"
+
+    docker_image = image_name
+    if images_path and os.path.exists(images_path):
+        logging.info("Images file exists")
+        with open(images_path, 'r', encoding='utf-8') as images_fd:
+            images = json.load(images_fd)
+            logging.info("Got images %s", images)
+            if image_name in images:
+                docker_image += ':' + images[image_name]
+    else:
+        logging.info("Images file not found")
+
+    for i in range(10):
+        try:
+            logging.info("Pulling image %s", docker_image)
+            subprocess.check_output(f"docker pull {docker_image}", stderr=subprocess.STDOUT, shell=True)
+            break
+        except Exception as ex:
+            time.sleep(i * 3)
+            logging.info("Got exception pulling docker image %s", ex)
+    else:
+        raise Exception(f"Cannot pull image {docker_image} from dockerhub, all retries exceeded")
+
+    build_config = get_build_config(build_number, repo_path)
+    build_config_str = build_config_to_string(build_config)
+    urls = get_build_urls(build_config_str, reports_path)
+    if not urls:
+        raise Exception("No build URLs found")
+
+    packages_path = os.path.join(temp_path, "packages")
+    if not os.path.exists(packages_path):
+        os.makedirs(packages_path)
+
+    server_log_path = os.path.join(temp_path, "server_log")
+    if not os.path.exists(server_log_path):
+        os.makedirs(server_log_path)
+
+    result_path = os.path.join(temp_path, "result_path")
+    if not os.path.exists(result_path):
+        os.makedirs(result_path)
+
+    run_log_path = os.path.join(temp_path, "runlog.log")
+
+    
download_builds(packages_path, urls)
+    run_command = get_run_command(packages_path, result_path, server_log_path, docker_image)
+    logging.info("Going to run stress tests: %s", run_command)
+
+    with open(run_log_path, 'w', encoding='utf-8') as log:
+        with subprocess.Popen(run_command, shell=True, stderr=log, stdout=log) as process:
+            retcode = process.wait()
+            if retcode == 0:
+                logging.info("Run successfully")
+            else:
+                logging.info("Run failed")
+
+    subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True)
+
+    s3_helper = S3Helper('https://s3.amazonaws.com')
+    state, description, test_results, additional_logs = process_results(result_path, server_log_path, run_log_path)
+    report_url = upload_results(s3_helper, pr_info.number, pr_info.sha, test_results, run_log_path, additional_logs, check_name)
+    print(f"::notice ::Report url: {report_url}")
+    commit = get_commit(gh, pr_info.sha)
+    commit.create_status(context=check_name, description=description, state=state, target_url=report_url)
diff --git a/tests/ci/worker/init_func_tester.sh b/tests/ci/worker/init_func_tester.sh
new file mode 100644
index 00000000000..b117f11556d
--- /dev/null
+++ b/tests/ci/worker/init_func_tester.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+echo "Running init script"
+export DEBIAN_FRONTEND=noninteractive
+export RUNNER_HOME=/home/ubuntu/actions-runner
+
+echo "Receiving token"
+export RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value`
+export RUNNER_URL="https://github.com/ClickHouse"
+# Funny fact: the metadata service has a fixed IP
+export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id`
+
+cd $RUNNER_HOME
+
+echo "Going to configure runner"
+sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,func-tester' --work _work
+
+echo "Run"
+sudo -u ubuntu ./run.sh
diff --git a/tests/ci/worker/init_stress_tester.sh b/tests/ci/worker/init_stress_tester.sh
new file mode 100644
index 00000000000..54ed944b274
--- /dev/null
+++ b/tests/ci/worker/init_stress_tester.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+echo "Running init script"
+export DEBIAN_FRONTEND=noninteractive
+export RUNNER_HOME=/home/ubuntu/actions-runner
+
+echo "Receiving token"
+export RUNNER_TOKEN=`/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value`
+export RUNNER_URL="https://github.com/ClickHouse"
+# Funny fact: the metadata service has a fixed IP
+export INSTANCE_ID=`curl -s http://169.254.169.254/latest/meta-data/instance-id`
+
+cd $RUNNER_HOME
+
+echo "Going to configure runner"
+sudo -u ubuntu ./config.sh --url $RUNNER_URL --token $RUNNER_TOKEN --name $INSTANCE_ID --runnergroup Default --labels 'self-hosted,Linux,X64,stress-tester' --work _work
+
+echo "Run"
+sudo -u ubuntu ./run.sh
diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py
index 472ecaf608b..3ba73b3405f 100644
--- a/tests/integration/test_backup_restore_new/test.py
+++ b/tests/integration/test_backup_restore_new/test.py
@@ -6,9 +6,11 @@ cluster = ClickHouseCluster(__file__)
 instance = cluster.add_instance('instance')
 
-def create_and_fill_table():
+def create_and_fill_table(engine="MergeTree"):
+    if engine == "MergeTree":
+        engine = "MergeTree ORDER BY y PARTITION BY x%10"
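+    # Log family engines (Log, TinyLog, StripeLog) take no ORDER BY/PARTITION BY clause,
+    # so only the default MergeTree engine gets one.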
instance.query("CREATE DATABASE test") - instance.query("CREATE TABLE test.table(x UInt32, y String) ENGINE=MergeTree ORDER BY y PARTITION BY x%10") + instance.query(f"CREATE TABLE test.table(x UInt32, y String) ENGINE={engine}") instance.query("INSERT INTO test.table SELECT number, toString(number) FROM numbers(100)") @@ -36,9 +38,11 @@ def new_backup_name(): return f"test-backup-{backup_id_counter}" -def test_restore_table(): + +@pytest.mark.parametrize("engine", ["MergeTree", "Log", "TinyLog", "StripeLog"]) +def test_restore_table(engine): backup_name = new_backup_name() - create_and_fill_table() + create_and_fill_table(engine=engine) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO '{backup_name}'") @@ -50,9 +54,10 @@ def test_restore_table(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" -def test_restore_table_into_existing_table(): +@pytest.mark.parametrize("engine", ["MergeTree", "Log", "TinyLog", "StripeLog"]) +def test_restore_table_into_existing_table(engine): backup_name = new_backup_name() - create_and_fill_table() + create_and_fill_table(engine=engine) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO '{backup_name}'") diff --git a/tests/integration/test_backward_compatibility/test_aggregate_fixed_key.py b/tests/integration/test_backward_compatibility/test_aggregate_fixed_key.py index 8819be527fd..98658ec81d0 100644 --- a/tests/integration/test_backward_compatibility/test_aggregate_fixed_key.py +++ b/tests/integration/test_backward_compatibility/test_aggregate_fixed_key.py @@ -2,7 +2,7 @@ import pytest from helpers.cluster import ClickHouseCluster -cluster = ClickHouseCluster(__file__) +cluster = ClickHouseCluster(__file__, name="aggregate_fixed_key") node1 = cluster.add_instance('node1', with_zookeeper=True, image='yandex/clickhouse-server', tag='21.3', with_installed_binary=True) node2 = cluster.add_instance('node2', with_zookeeper=True, image='yandex/clickhouse-server') node3 = cluster.add_instance('node3', with_zookeeper=True, image='yandex/clickhouse-server') diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 6106966e5b7..4bf778531a7 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -2850,7 +2850,7 @@ def test_kafka_formats_with_broken_message(kafka_cluster): # broken message b'\x05\x02\x69\x64\x07\x62\x6c\x6f\x63\x6b\x4e\x6f\x04\x76\x61\x6c\x31\x04\x76\x61\x6c\x32\x04\x76\x61\x6c\x33\x05\x49\x6e\x74\x36\x34\x06\x53\x74\x72\x69\x6e\x67\x06\x53\x74\x72\x69\x6e\x67\x07\x46\x6c\x6f\x61\x74\x33\x32\x05\x55\x49\x6e\x74\x38\x00\x00\x00\x00\x00\x00\x00\x00\x03\x42\x41\x44\x02\x41\x4d\x00\x00\x00\x3f\x01', ], - 'expected':'{"raw_message":"0502696407626C6F636B4E6F0476616C310476616C320476616C3305496E74363406537472696E6706537472696E6707466C6F617433320555496E743800000000000000000342414402414D0000003F01","error":"Cannot read all data. Bytes read: 9. 
Bytes expected: 65.: (at row 1)\\n"}', + 'expected':'{"raw_message":"0502696407626C6F636B4E6F0476616C310476616C320476616C3305496E74363406537472696E6706537472696E6707466C6F617433320555496E743800000000000000000342414402414D0000003F01","error":"Type of \'blockNo\' must be UInt16, not String"}', 'printable':False, }, 'ORC': { diff --git a/tests/queries/0_stateless/00300_csv.reference b/tests/queries/0_stateless/00300_csv.reference index 9d2fe7233d8..42cd22078c4 100644 --- a/tests/queries/0_stateless/00300_csv.reference +++ b/tests/queries/0_stateless/00300_csv.reference @@ -1,6 +1,10 @@ +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline +here" "x","y","z","a","b" "Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" +"x","y","z","a","b" +"String","UInt8","Array(UInt8)","Tuple(UInt16, Array(String))","String" "Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" 0,"0","[]","2000-01-01","2000-01-01 00:00:00" diff --git a/tests/queries/0_stateless/00300_csv.sql b/tests/queries/0_stateless/00300_csv.sql index 0c761ad0af1..76b1b29df06 100644 --- a/tests/queries/0_stateless/00300_csv.sql +++ b/tests/queries/0_stateless/00300_csv.sql @@ -1,3 +1,4 @@ -SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNames; SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSV; +SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNames; +SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNamesAndTypes; SELECT number, toString(number), range(number), toDate('2000-01-01') + number, toDateTime('2000-01-01 00:00:00') + number FROM system.numbers LIMIT 10 FORMAT CSV; diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 0aee9abe25c..39721ce1050 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -13,7 +13,7 @@ Hello "world", 789 ,2016-01-03 "Hello world", 100, 2016-01-04, default,, - default-eof,,' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --query="INSERT INTO csv FORMAT CSV"; + default-eof,,' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY d"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; @@ -33,7 +33,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Europe/Moscow echo 'NULL, NULL "2016-01-01 01:02:03",NUL -"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --query="INSERT INTO csv FORMAT CSV"; +"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; diff --git a/tests/queries/0_stateless/00938_template_input_format.reference b/tests/queries/0_stateless/00938_template_input_format.reference index ce89532886d..e1f77d9a581 100644 --- a/tests/queries/0_stateless/00938_template_input_format.reference +++ b/tests/queries/0_stateless/00938_template_input_format.reference @@ -23,3 +23,11 @@ cv bn m","","as""df'gh","",456,"2016-01-02" "as""df'gh","","zx cv bn 
m","",789,"2016-01-04" "qwe,rty","","","",9876543210,"2016-01-03" +==== check raw ==== +"qwe,rty","as""df'gh","","zx +cv bn m",123,"2016-01-01" +"as""df\'gh","","zx +cv bn m","qwe,rty",456,"2016-01-02" +"zx\cv\bn m","qwe,rty","as""df'gh","",789,"2016-01-04" +"","zx +cv bn m","qwe,rty","as""df'gh",9876543210,"2016-01-03" diff --git a/tests/queries/0_stateless/00938_template_input_format.sh b/tests/queries/0_stateless/00938_template_input_format.sh index 75616b35af0..9218f4bebca 100755 --- a/tests/queries/0_stateless/00938_template_input_format.sh +++ b/tests/queries/0_stateless/00938_template_input_format.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Tags: no-parallel + # shellcheck disable=SC2016,SC2028 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) @@ -50,6 +52,30 @@ format_template_rows_between_delimiter = ','"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV"; +echo "==== check raw ====" + +echo -ne '{prefix} \n${data}\n $$ suffix $$\n' > "$CURDIR"/00938_template_input_format_resultset.tmp +echo -ne 'n:\t${n:Escaped}, s1:\t${0:Raw}\t, s2:\t${1:Quoted}, s3:\t${s3:JSON}, s4:\t${3:CSV}, d:\t${d:Escaped}\t' > "$CURDIR"/00938_template_input_format_row.tmp + + +$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE template1"; + +echo "{prefix}"' '" +n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx +cv bn m\", d: 2016-01-01 ; +n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 2016-01-02 ; +n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ; +n: 789, s1: zx\cv\bn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04"$'\t'" + $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template1 FORMAT Template SETTINGS \ +format_template_resultset = '$CURDIR/00938_template_input_format_resultset.tmp', \ +format_template_row = '$CURDIR/00938_template_input_format_row.tmp', \ +format_template_rows_between_delimiter = ';\n'"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT CSV"; + + + $CLICKHOUSE_CLIENT --query="DROP TABLE template1"; $CLICKHOUSE_CLIENT --query="DROP TABLE template2"; rm "$CURDIR"/00938_template_input_format_resultset.tmp "$CURDIR"/00938_template_input_format_row.tmp + diff --git a/tests/queries/0_stateless/01034_JSONCompactEachRow.reference b/tests/queries/0_stateless/01034_JSONCompactEachRow.reference index 6ec53e11fc9..bfc99d688d5 100644 --- a/tests/queries/0_stateless/01034_JSONCompactEachRow.reference +++ b/tests/queries/0_stateless/01034_JSONCompactEachRow.reference @@ -12,6 +12,11 @@ [1, "a"] [2, "b"] [3, "c"] +---------- +["value", "name"] +[1, "a"] +[2, "b"] +[3, "c"] 4 ["name", "c"] ["String", "UInt64"] @@ -31,17 +36,33 @@ 8 ["first", 1, 2, 0] ["second", 2, 0, 6] +["first", 1, 2, 0] +["second", 2, 0, 6] 9 ["first", 1, 2, 8] ["second", 2, 32, 6] +["first", 1, 2, 8] +["second", 2, 32, 6] 10 ["first", 1, 16, 8] ["second", 2, 32, 8] +["first", 1, 16, 8] +["second", 2, 32, 8] 11 ["v1", "v2", "v3", "v4"] ["String", "UInt8", "UInt16", "UInt8"] ["", 2, 3, 1] +["", 2, 3, 1] +---------- +["v1", "v2", "v3", "v4"] +["", 2, 3, 1] +["", 2, 3, 1] 12 ["v1", "n.id", "n.name"] ["UInt8", "Array(UInt8)", "Array(String)"] [16, [15,16,0], ["first","second","third"]] +[16, [15,16,0], ["first","second","third"]] +---------- +["v1", "n.id", "n.name"] +[16, [15,16,0], ["first","second","third"]] +[16, [15,16,0], ["first","second","third"]] diff --git a/tests/queries/0_stateless/01034_JSONCompactEachRow.sql b/tests/queries/0_stateless/01034_JSONCompactEachRow.sql index 
f5442c90a2a..f71597a60e5 100644
--- a/tests/queries/0_stateless/01034_JSONCompactEachRow.sql
+++ b/tests/queries/0_stateless/01034_JSONCompactEachRow.sql
@@ -10,8 +10,10 @@ SELECT 2;
 /* Check Totals */
 SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactEachRow;
 SELECT 3;
-/* Check JSONCompactEachRowWithNamesAndTypes Output */
+/* Check JSONCompactEachRowWithNames and JSONCompactEachRowWithNamesAndTypes Output */
 SELECT * FROM test_table FORMAT JSONCompactEachRowWithNamesAndTypes;
+SELECT '----------';
+SELECT * FROM test_table FORMAT JSONCompactEachRowWithNames;
 SELECT 4;
 /* Check Totals */
 SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactEachRowWithNamesAndTypes;
@@ -35,30 +37,39 @@ INSERT INTO test_table_2 FORMAT JSONCompactEachRow [16, [15, 16, null], ["first"
 SELECT * FROM test_table_2 FORMAT JSONCompactEachRow;
 TRUNCATE TABLE test_table_2;
 SELECT 8;
-/* Check JSONCompactEachRowWithNamesAndTypes Output */
+/* Check JSONCompactEachRowWithNames and JSONCompactEachRowWithNamesAndTypes Input */
 SET input_format_null_as_default = 0;
 INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", 1, "2", null]["second", 2, null, 6];
+INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "v3", "v4"]["first", 1, "2", null]["second", 2, null, 6];
 SELECT * FROM test_table FORMAT JSONCompactEachRow;
 TRUNCATE TABLE test_table;
 SELECT 9;
 /* Check input_format_null_as_default = 1 */
 SET input_format_null_as_default = 1;
 INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", 1, "2", null] ["second", 2, null, 6];
+INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "v3", "v4"]["first", 1, "2", null] ["second", 2, null, 6];
 SELECT * FROM test_table FORMAT JSONCompactEachRow;
 SELECT 10;
 /* Check Header */
 TRUNCATE TABLE test_table;
 SET input_format_skip_unknown_fields = 1;
 INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "invalid_column"]["String", "UInt8", "UInt8"]["first", 1, 32]["second", 2, "64"];
+INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "invalid_column"]["first", 1, 32]["second", 2, "64"];
 SELECT * FROM test_table FORMAT JSONCompactEachRow;
 SELECT 11;
 TRUNCATE TABLE test_table;
 INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v4", "v2", "v3"]["UInt8", "UInt8", "UInt16"][1, 2, 3]
+INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v4", "v2", "v3"][1, 2, 3]
 SELECT * FROM test_table FORMAT JSONCompactEachRowWithNamesAndTypes;
+SELECT '----------';
+SELECT * FROM test_table FORMAT JSONCompactEachRowWithNames;
 SELECT 12;
 /* Check Nested */
 INSERT INTO test_table_2 FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "n.id", "n.name"]["UInt8", "Array(UInt8)", "Array(String)"][16, [15, 16, null], ["first", "second", "third"]];
+INSERT INTO test_table_2 FORMAT JSONCompactEachRowWithNames ["v1", "n.id", "n.name"][16, [15, 16, null], ["first", "second", "third"]];
 SELECT * FROM test_table_2 FORMAT JSONCompactEachRowWithNamesAndTypes;
+SELECT '----------';
+SELECT * FROM test_table_2 FORMAT JSONCompactEachRowWithNames;
 DROP TABLE IF EXISTS test_table;
 DROP TABLE IF EXISTS test_table_2;
diff --git a/tests/queries/0_stateless/01195_formats_diagnostic_info.reference
index
15fc31538ce..eddbb80198d 100644 --- a/tests/queries/0_stateless/01195_formats_diagnostic_info.reference +++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.reference @@ -1,5 +1,5 @@ CSV -Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR +Column 2, name: d, type: Decimal(18, 10), parsed text: " 123456789"ERROR ERROR: garbage after DateTime: "7, Hello" ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. ERROR: There is no line feed. "1" found instead. @@ -28,3 +28,14 @@ ERROR: There is no delimiter before field 1: expected "", got "7Hello< ERROR: There is no delimiter after last field: expected "", got "1" ERROR: There is no delimiter after last field: expected "", got "Hello" Column 0, name: t, type: DateTime, ERROR: text "" is not like DateTime +JSONCompactEachRow +Column 2, name: d, type: Decimal(18, 10), parsed text: " 123456789"ERROR +Column 0, name: t, type: DateTime, parsed text: "2020-04-21 12:34:56"ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. +ERROR: garbage after DateTime: "7, Hello" +ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. +ERROR: There is no closing parenthesis (']') at the end of the row. "," found instead. +Column 1, name: s, type: String, parsed text: ERROR +ERROR: There is no '[' before the row. +ERROR: garbage after Decimal(18, 10): ";" +ERROR: There is no comma. ";" found instead. +ERROR: Closing parenthesis (']') found where comma is expected. It's like your file has less columns than expected. diff --git a/tests/queries/0_stateless/01195_formats_diagnostic_info.sh b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh index 6c64b17f719..dde410d95c4 100755 --- a/tests/queries/0_stateless/01195_formats_diagnostic_info.sh +++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh @@ -38,3 +38,19 @@ echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ER echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR" echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR" echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR" + +PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format JSONCompactEachRow) +echo '["2020-04-21 12:34:56", "Hello", 12345678]' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "JSONCompactEachRow" +echo '["2020-04-21 12:34:56", "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '["2020-04-21 12:34:567", "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '["2020-04-21 12:34:56"7, "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '["2020-04-21 12:34:56", "Hello", 12345678,1]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo '["2020-04-21 12:34:56",,123Hello]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '["2020-04-21 12:34:56", "Hello", 12345678\n]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '"2020-04-21 12:34:56", "Hello", 12345678]' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '["2020-04-21 12:34:56", "Hello", 12345678;' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '["2020-04-21 12:34:56", "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '["2020-04-21 12:34:56", "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '["2020-04-21 12:34:56", "Hello"; 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e 
'["2020-04-21 12:34:56", "Hello"\n' | "${PARSER[@]}" 2>&1| grep "ERROR" +echo -e '["2020-04-21 12:34:56", "Hello"]' | "${PARSER[@]}" 2>&1| grep "ERROR" diff --git a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference index 6f1974ccd73..ffea4c736dc 100644 --- a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference +++ b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference @@ -7,7 +7,21 @@ number UInt64 0 1 +TSVRawWithNames +number +0 +1 +TSVRawWithNamesAndTypes +number +UInt64 +0 +1 CSVWithNames "number" 0 1 +CSVWithNamesAndTypes +"number" +"UInt64" +0 +1 diff --git a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh index ad9cc2c53a8..69f3ab1c9a8 100755 --- a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh +++ b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh @@ -15,5 +15,14 @@ ${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNames echo 'TSVWithNamesAndTypes' ${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNamesAndTypes +echo 'TSVRawWithNames' +${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNames + +echo 'TSVRawWithNamesAndTypes' +${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNamesAndTypes + echo 'CSVWithNames' ${CLICKHOUSE_LOCAL} "${opts[@]}" --format CSVWithNames + +echo 'CSVWithNamesAndTypes' +${CLICKHOUSE_LOCAL} "${opts[@]}" --format CSVWithNamesAndTypes diff --git a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference index 4f2a79b9905..78286b89a39 100644 --- a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference +++ b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference @@ -2,6 +2,11 @@ zero rows TSVWithNames TSVWithNamesAndTypes CSVWithNames +CSVWithNamesAndTypes +JSONCompactEachRowWithNames +JSONCompactEachRowWithNamesAndTypes +JSONCompactStringsEachRow +JSONCompactStringsEachRowWithNamesAndTypes multi clickhouse-local one file TSVWithNames 0 @@ -15,3 +20,23 @@ CSVWithNames 0 0 0 +CSVWithNamesAndTypes +0 +0 +0 +JSONCompactEachRowWithNames +0 +0 +0 +JSONCompactEachRowWithNamesAndTypes +0 +0 +0 +JSONCompactStringsEachRow +0 +0 +0 +JSONCompactStringsEachRowWithNamesAndTypes +0 +0 +0 diff --git a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh index 469f7e7008b..a634f689dca 100755 --- a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh +++ b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -6,26 +7,26 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # zero rows echo 'zero rows' -for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames; do +for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames CSVWithNamesAndTypes JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRow JSONCompactStringsEachRowWithNamesAndTypes; do echo $format ${CLICKHOUSE_LOCAL} --query=" - CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format.tsv') AS SELECT * FROM numbers(1) 
WHERE number < 0; + CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format') AS SELECT * FROM numbers(1) WHERE number < 0; SELECT * FROM ${format}_01375; DROP TABLE ${format}_01375; " - rm 01375_$format.tsv + rm 01375_$format done # run multiple times to the same file echo 'multi clickhouse-local one file' -for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames; do +for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames CSVWithNamesAndTypes JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRow JSONCompactStringsEachRowWithNamesAndTypes; do echo $format for _ in {1..2}; do ${CLICKHOUSE_LOCAL} --query=" - CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format.tsv') AS SELECT * FROM numbers(1); + CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format') AS SELECT * FROM numbers(1); SELECT * FROM ${format}_01375; DROP TABLE ${format}_01375; " done - rm 01375_$format.tsv + rm 01375_$format done diff --git a/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference b/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference index fb1a066f272..8a69cf26ffd 100644 --- a/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference +++ b/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference @@ -12,6 +12,11 @@ ["1", "a"] ["2", "b"] ["3", "c"] +---------- +["value", "name"] +["1", "a"] +["2", "b"] +["3", "c"] 4 ["name", "c"] ["String", "UInt64"] @@ -31,17 +36,33 @@ 8 ["first", "1", "2", "0"] ["second", "2", "0", "6"] +["first", "1", "2", "0"] +["second", "2", "0", "6"] 9 ["first", "1", "2", "8"] ["second", "2", "32", "6"] +["first", "1", "2", "8"] +["second", "2", "32", "6"] 10 ["first", "1", "16", "8"] ["second", "2", "32", "8"] +["first", "1", "16", "8"] +["second", "2", "32", "8"] 11 ["v1", "v2", "v3", "v4"] ["String", "UInt8", "UInt16", "UInt8"] ["", "2", "3", "1"] +["", "2", "3", "1"] +--------- +["v1", "v2", "v3", "v4"] +["", "2", "3", "1"] +["", "2", "3", "1"] 12 ["v1", "n.id", "n.name"] ["UInt8", "Array(UInt8)", "Array(String)"] ["16", "[15,16,17]", "['first','second','third']"] +["16", "[15,16,17]", "['first','second','third']"] +--------- +["v1", "n.id", "n.name"] +["16", "[15,16,17]", "['first','second','third']"] +["16", "[15,16,17]", "['first','second','third']"] diff --git a/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql b/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql index 925faa3a17f..869041193cf 100644 --- a/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql +++ b/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql @@ -12,8 +12,10 @@ SELECT 2; /* Check Totals */ SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactStringsEachRow; SELECT 3; -/* Check JSONCompactStringsEachRowWithNamesAndTypes Output */ +/* Check JSONCompactStringsEachRowWithNames and JSONCompactStringsEachRowWithNamesAndTypes Output */ SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes; +SELECT '----------'; +SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNames; SELECT 4; /* Check Totals */ SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactStringsEachRowWithNamesAndTypes; @@ -37,30 +39,39 @@ INSERT INTO test_table_2 FORMAT JSONCompactStringsEachRow ["16", "[15, 16, 17]", SELECT * FROM test_table_2 FORMAT JSONCompactStringsEachRow; TRUNCATE TABLE test_table_2; SELECT 8; -/* Check 
JSONCompactStringsEachRowWithNamesAndTypes Output */
+/* Check JSONCompactStringsEachRowWithNames and JSONCompactStringsEachRowWithNamesAndTypes Input */
 SET input_format_null_as_default = 0;
 INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", "1", "2", "null"]["second", "2", "null", "6"];
+INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v1", "v2", "v3", "v4"]["first", "1", "2", "null"]["second", "2", "null", "6"];
 SELECT * FROM test_table FORMAT JSONCompactStringsEachRow;
 TRUNCATE TABLE test_table;
 SELECT 9;
 /* Check input_format_null_as_default = 1 */
 SET input_format_null_as_default = 1;
 INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", "1", "2", "null"] ["second", "2", "null", "6"];
+INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v1", "v2", "v3", "v4"]["first", "1", "2", "null"] ["second", "2", "null", "6"];
 SELECT * FROM test_table FORMAT JSONCompactStringsEachRow;
 SELECT 10;
 /* Check Header */
 TRUNCATE TABLE test_table;
 SET input_format_skip_unknown_fields = 1;
 INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "v2", "invalid_column"]["String", "UInt8", "UInt8"]["first", "1", "32"]["second", "2", "64"];
+INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v1", "v2", "invalid_column"]["first", "1", "32"]["second", "2", "64"];
 SELECT * FROM test_table FORMAT JSONCompactStringsEachRow;
 SELECT 11;
 TRUNCATE TABLE test_table;
 INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v4", "v2", "v3"]["UInt8", "UInt8", "UInt16"]["1", "2", "3"]
+INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v4", "v2", "v3"]["1", "2", "3"]
 SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes;
+SELECT '---------';
+SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNames;
 SELECT 12;
 /* Check Nested */
 INSERT INTO test_table_2 FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "n.id", "n.name"]["UInt8", "Array(UInt8)", "Array(String)"]["16", "[15, 16, 17]", "['first', 'second', 'third']"];
+INSERT INTO test_table_2 FORMAT JSONCompactStringsEachRowWithNames ["v1", "n.id", "n.name"]["16", "[15, 16, 17]", "['first', 'second', 'third']"];
 SELECT * FROM test_table_2 FORMAT JSONCompactStringsEachRowWithNamesAndTypes;
+SELECT '---------';
+SELECT * FROM test_table_2 FORMAT JSONCompactStringsEachRowWithNames;
 DROP TABLE IF EXISTS test_table;
 DROP TABLE IF EXISTS test_table_2;
diff --git a/tests/queries/0_stateless/02026_storage_filelog_largefile.sh b/tests/queries/0_stateless/02026_storage_filelog_largefile.sh
index 6babcc1e4f1..acd1c464334 100755
--- a/tests/queries/0_stateless/02026_storage_filelog_largefile.sh
+++ b/tests/queries/0_stateless/02026_storage_filelog_largefile.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Tags: long
+# Tags: long, no-parallel
 
 set -eu
 
diff --git a/tests/queries/0_stateless/02097_json_strings_deserialization.reference b/tests/queries/0_stateless/02097_json_strings_deserialization.reference
new file mode 100644
index 00000000000..8d7ffe54606
--- /dev/null
+++ b/tests/queries/0_stateless/02097_json_strings_deserialization.reference
@@ -0,0 +1,4 @@
+test\n\t\0\n test\n\t\0\n
+test\n\t\0\n test\n\t\0\n
+test\n\t\0\n test\n\t\0\n
+test\n\t\0\n test\n\t\0\n
diff --git a/tests/queries/0_stateless/02097_json_strings_deserialization.sh b/tests/queries/0_stateless/02097_json_strings_deserialization.sh
new file mode 100755
index 00000000000..ae9e1ea7645
--- /dev/null
+++ b/tests/queries/0_stateless/02097_json_strings_deserialization.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "drop table if exists test_02097"
+$CLICKHOUSE_CLIENT -q "create table test_02097 (s String, f FixedString(8)) engine=Memory()"
+echo -e "('test\n\t\0\n', 'test\n\t\0\n')" | $CLICKHOUSE_CLIENT -q "insert into test_02097 format Values"
+$CLICKHOUSE_CLIENT -q "select * from test_02097 format JSONStringsEachRow" | $CLICKHOUSE_CLIENT -q "insert into test_02097 format JSONStringsEachRow"
+$CLICKHOUSE_CLIENT -q "select * from test_02097 format JSONCompactStringsEachRow" | $CLICKHOUSE_CLIENT -q "insert into test_02097 format JSONCompactStringsEachRow"
+$CLICKHOUSE_CLIENT -q "select * from test_02097"
+$CLICKHOUSE_CLIENT -q "drop table test_02097"
+
diff --git a/tests/queries/0_stateless/02098_with_types_use_header.reference b/tests/queries/0_stateless/02098_with_types_use_header.reference
new file mode 100644
index 00000000000..c1d70452d1d
--- /dev/null
+++ b/tests/queries/0_stateless/02098_with_types_use_header.reference
@@ -0,0 +1,16 @@
+TSVWithNamesAndTypes
+OK
+OK
+OK
+CSVWithNamesAndTypes
+OK
+OK
+OK
+JSONCompactEachRowWithNamesAndTypes
+OK
+OK
+OK
+JSONCompactStringsEachRowWithNamesAndTypes
+OK
+OK
+OK
diff --git a/tests/queries/0_stateless/02098_with_types_use_header.sh b/tests/queries/0_stateless/02098_with_types_use_header.sh
new file mode 100755
index 00000000000..846696d18c0
--- /dev/null
+++ b/tests/queries/0_stateless/02098_with_types_use_header.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02098"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02098 (x UInt32, y String, z Date) engine=Memory()"
+
+echo "TSVWithNamesAndTypes"
+echo -e "x\ty\tz\nString\tDate\tUInt32\ntext\t2020-01-01\t1" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT TSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+echo -e "y\tz\tx\nString\tDate\tUInt32\ntext\t2020-01-01\t1" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT TSVWithNamesAndTypes" && echo 'OK' || echo 'FAIL'
+echo -e "x\tz\ty\nUInt32\tString\tDate\n1\ttext\t2020-01-01" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT TSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+
+
+echo "CSVWithNamesAndTypes"
+echo -e "'x','y','z'\n'String','Date','UInt32'\n'text','2020-01-01',1" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT CSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+echo -e "'y','z','x'\n'String','Date','UInt32'\n'text','2020-01-01',1" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT CSVWithNamesAndTypes" && echo 'OK' || echo 'FAIL'
+echo -e "'x','z','y'\n'UInt32','String','Date'\n1,'text','2020-01-01'" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT CSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+
+
+echo "JSONCompactEachRowWithNamesAndTypes"
+echo -e '["x","y","z"]\n["String","Date","UInt32"]\n["text","2020-01-01",1]' | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+echo -e '["y","z","x"]\n["String","Date","UInt32"]\n["text","2020-01-01",1]' | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactEachRowWithNamesAndTypes" && echo 'OK' || echo 'FAIL'
+echo -e '["x","z","y"]\n["UInt32", "String", "Date"]\n[1, "text","2020-01-01"]' | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+
+echo "JSONCompactStringsEachRowWithNamesAndTypes"
+echo -e '["x","y","z"]\n["String","Date","UInt32"]\n["text","2020-01-01","1"]' | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactStringsEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+echo -e '["y","z","x"]\n["String","Date","UInt32"]\n["text","2020-01-01","1"]' | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactStringsEachRowWithNamesAndTypes" && echo 'OK' || echo 'FAIL'
+echo -e '["x","z","y"]\n["UInt32", "String", "Date"]\n["1", "text","2020-01-01"]' | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactStringsEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_02098"
diff --git a/tests/queries/0_stateless/02099_tsv_raw_format.reference b/tests/queries/0_stateless/02099_tsv_raw_format.reference
new file mode 100644
index 00000000000..de46cf8dff7
--- /dev/null
+++ b/tests/queries/0_stateless/02099_tsv_raw_format.reference
@@ -0,0 +1,113 @@
+TSVRaw
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+TSVRawWithNames
+number string date
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+TSVRawWithNamesAndTypes
+number string date
+UInt64 String Date
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+TabSeparatedRaw
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+TabSeparatedRawWithNames
+number string date
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+TabSeparatedRawWithNamesAndTypes
+number string date
+UInt64 String Date
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0 0 1970-01-01
+1 1 1970-01-02
+2 2 1970-01-03
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+0
+\N
+2
+\N
+nSome text
+b1cad4eb4be08a40387c9de70d02fcc2 -
+b1cad4eb4be08a40387c9de70d02fcc2 -
diff --git a/tests/queries/0_stateless/02099_tsv_raw_format.sh b/tests/queries/0_stateless/02099_tsv_raw_format.sh
new file mode 100755
index 00000000000..16b695e4037
--- /dev/null
+++ b/tests/queries/0_stateless/02099_tsv_raw_format.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02099"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02099 (number UInt64, string String, date Date) ENGINE=Memory()"
+
+FORMATS=('TSVRaw' 'TSVRawWithNames' 'TSVRawWithNamesAndTypes' 'TabSeparatedRaw' 'TabSeparatedRawWithNames' 'TabSeparatedRawWithNamesAndTypes')
+
+for format in "${FORMATS[@]}"
+do
+    echo $format
+    $CLICKHOUSE_CLIENT -q "INSERT INTO test_02099 SELECT number, toString(number), toDate(number) FROM numbers(3)"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02099 FORMAT $format"
+
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02099 FORMAT $format" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_02099 FORMAT $format"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02099"
+
+    $CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02099"
+done
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_02099"
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_nullable_02099"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_nullable_02099 ENGINE=Memory() AS SELECT number % 2 ? NULL : number from numbers(4)";
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSVRaw"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSVRaw" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_02099 FORMAT TSVRaw"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSV" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_02099 FORMAT TSVRaw"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSVRaw" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_02099 FORMAT TSV"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099"
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_nullable_02099"
+
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_nullable_string_02099"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_nullable_string_02099 (s Nullable(String)) ENGINE=Memory()";
+
+echo 'nSome text' | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_string_02099 FORMAT TSVRaw"
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_string_02099"
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_nullable_string_02099"
+
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_parallel_parsing_02099"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_parallel_parsing_02099 (x UInt64, a Array(UInt64), s String) ENGINE=Memory()";
+$CLICKHOUSE_CLIENT -q "SELECT number AS x, range(number % 50) AS a, toString(a) AS s FROM numbers(1000000) FORMAT TSVRaw" | $CLICKHOUSE_CLIENT --input_format_parallel_parsing=0 -q "INSERT INTO test_parallel_parsing_02099 FORMAT TSVRaw"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_parallel_parsing_02099 ORDER BY x" | md5sum
+
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_parallel_parsing_02099"
+
+$CLICKHOUSE_CLIENT -q "SELECT number AS x, range(number % 50) AS a, toString(a) AS s FROM numbers(1000000) FORMAT TSVRaw" | $CLICKHOUSE_CLIENT --input_format_parallel_parsing=1 -q "INSERT INTO test_parallel_parsing_02099 FORMAT TSVRaw"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_parallel_parsing_02099 ORDER BY x" | md5sum
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_parallel_parsing_02099"
+
diff --git a/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference
new file mode 100644
index 00000000000..12b4d6ad854
--- /dev/null
+++ b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference
@@ -0,0 +1,14 @@
+CSV
+\N
+TSV
+\N
+TSVRaw
+\N
+TSKV
+\N
+JSONCompactEachRow
+\N
+JSONEachRow
+\N
+Values
+\N
diff --git a/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh
new file mode 100755
index 00000000000..2fd1f130b7a
--- /dev/null
+++ b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02100"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02100 (x LowCardinality(Nullable(String)) DEFAULT 'default') ENGINE=Memory()"
+
+FORMATS=('CSV' 'TSV' 'TSVRaw' 'TSKV' 'JSONCompactEachRow' 'JSONEachRow' 'Values')
+
+for format in "${FORMATS[@]}"
+do
+    echo $format
+    $CLICKHOUSE_CLIENT -q "SELECT NULL as x FORMAT $format" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_02100 FORMAT $format"
+
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02100"
+
+    $CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02100"
+done
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_02100"
+
diff --git a/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference
new file mode 100644
index 00000000000..61444c7a238
--- /dev/null
+++ b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference
@@ -0,0 +1,16 @@
+TSV
+1 42
+2 0
+3 42
+4 0
+CSV
+1 42
+2 0
+3 42
+4 0
+JSONEachRow
+1 42
+2 0
+JSONCompactEachRow
+1 42
+2 0
diff --git a/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh
new file mode 100755
index 00000000000..4f03c72cac3
--- /dev/null
+++ b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02101"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02101 (x UInt64, y UInt64 DEFAULT 42) ENGINE=Memory()"
+
+echo 'TSV'
+echo -e 'x\ty\n1\t' | $CLICKHOUSE_CLIENT --input_format_tsv_empty_as_default=1 --input_format_defaults_for_omitted_fields=1 -q "INSERT INTO test_02101 FORMAT TSVWithNames"
+echo -e 'x\ty\n2\t' | $CLICKHOUSE_CLIENT --input_format_tsv_empty_as_default=1 --input_format_defaults_for_omitted_fields=0 -q "INSERT INTO test_02101 FORMAT TSVWithNames"
+echo -e 'x\tz\n3\t123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT TSVWithNames"
+echo -e 'x\tz\n4\t123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT TSVWithNames"
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02101"
+
+echo 'CSV'
+echo -e '"x","y"\n1,' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_defaults_for_omitted_fields=1 -q "INSERT INTO test_02101 FORMAT CSVWithNames"
+echo -e '"x","y"\n2,' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_defaults_for_omitted_fields=0 -q "INSERT INTO test_02101 FORMAT CSVWithNames"
+echo -e '"x","z"\n3,123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT CSVWithNames"
+echo -e '"x","z"\n4,123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT CSVWithNames"
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02101"
+
+echo 'JSONEachRow'
+echo -e '{"x" : 1, "z" : 123}' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONEachRow"
+echo -e '{"x" : 2, "z" : 123}' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONEachRow"
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02101"
+
+echo 'JSONCompactEachRow'
+echo -e '["x", "z"], [1, 123]' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONCompactEachRowWithNames"
+echo -e '["x", "z"], [2, 123]' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONCompactEachRowWithNames"
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x"
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_02101"
+
diff --git a/tests/queries/0_stateless/02102_row_binary_with_names_and_types.reference b/tests/queries/0_stateless/02102_row_binary_with_names_and_types.reference
new file mode 100644
index 00000000000..9011f20cd6a
--- /dev/null
+++ b/tests/queries/0_stateless/02102_row_binary_with_names_and_types.reference
@@ -0,0 +1,14 @@
+1 text 2020-01-01
+1 text 2020-01-01
+1 text 2020-01-01
+1 text 2020-01-01
+1 text 2020-01-01
+1 text 2020-01-01
+1 default 1970-01-01
+1 default 1970-01-01
+1 1970-01-01
+1 1970-01-01
+OK
+1 default 1970-01-01
+OK
+OK
diff --git a/tests/queries/0_stateless/02102_row_binary_with_names_and_types.sh b/tests/queries/0_stateless/02102_row_binary_with_names_and_types.sh
new file mode 100755
index 00000000000..e7307ad3ad5
--- /dev/null
+++ b/tests/queries/0_stateless/02102_row_binary_with_names_and_types.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02102"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02102 (x UInt32, y String DEFAULT 'default', z Date) engine=Memory()"
+
+
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' AS y, toDate('2020-01-01') AS z FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' AS y, toDate('2020-01-01') AS z FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' AS y, toDate('2020-01-01') AS z FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=0 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' AS y, toDate('2020-01-01') AS z FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=0 --input_format_with_types_use_header=0 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT 'text' AS y, toDate('2020-01-01') AS z, toUInt32(1) AS x FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+$CLICKHOUSE_CLIENT -q "SELECT 'text' AS y, toDate('2020-01-01') AS z, toUInt32(1) AS x FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, [[1, 2, 3], [4, 5], []] as a FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames" 2>&1 | grep -F -q "CANNOT_SKIP_UNKNOWN_FIELD" && echo 'OK' || echo 'FAIL'
+
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, [[1, 2, 3], [4, 5], []] as a FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
+$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
+
+
+$CLICKHOUSE_CLIENT -q "SELECT 'text' AS x, toDate('2020-01-01') AS y, toUInt32(1) AS z FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+
+$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' as z, toDate('2020-01-01') AS y FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL'
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE test_02102"
+
diff --git a/tests/queries/0_stateless/02103_with_names_and_types_parallel_parsing.reference b/tests/queries/0_stateless/02103_with_names_and_types_parallel_parsing.reference
new file mode 100644
index 00000000000..962b233e5e7
--- /dev/null
+++ b/tests/queries/0_stateless/02103_with_names_and_types_parallel_parsing.reference
@@ -0,0 +1,80 @@
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
+0 [0,1,2,3,4,5,6,7,8,9] 0
+1 [0,1,2,3,4,5,6,7,8,9,10] 1
+2 [0,1,2,3,4,5,6,7,8,9,10,11] 2
+3 [0,1,2,3,4,5,6,7,8,9,10,11,12] 3
+4 [0,1,2,3,4,5,6,7,8,9,10,11,12,13] 4
+5 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 5
+6 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 6
+7 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] 7
+8 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 8
+9 [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18] 9
diff --git a/tests/queries/0_stateless/02103_with_names_and_types_parallel_parsing.sh b/tests/queries/0_stateless/02103_with_names_and_types_parallel_parsing.sh
new file mode 100755
index 00000000000..487282099e2
--- /dev/null
+++ b/tests/queries/0_stateless/02103_with_names_and_types_parallel_parsing.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Tags: no-parallel
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
+
+DATA_FILE=$USER_FILES_PATH/test_02103.data
+
+FORMATS=('TSVWithNames' 'TSVWithNamesAndTypes' 'TSVRawWithNames' 'TSVRawWithNamesAndTypes' 'CSVWithNames' 'CSVWithNamesAndTypes' 'JSONCompactEachRowWithNames' 'JSONCompactEachRowWithNamesAndTypes')
+
+for format in "${FORMATS[@]}"
+do
+    $CLICKHOUSE_CLIENT -q "SELECT number, range(number + 10) AS array, toString(number) AS string FROM numbers(10) FORMAT $format" > $DATA_FILE
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103.data', '$format', 'number UInt64, array Array(UInt64), string String') SETTINGS input_format_parallel_parsing=1, min_chunk_bytes_for_parallel_parsing=40"
+done
+
+rm $DATA_FILE
+
diff --git a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference
index 6d663c33057..7ad5359a30e 100644
--- a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference
+++ b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference
@@ -1,12 +1,28 @@
 JSONEachRow, false
-7251839681e559f5a92db107571bb357 -
+e0a3c9978a92a277f2fff4664f3c1749 -
 JSONEachRow, true
-7251839681e559f5a92db107571bb357 -
+e0a3c9978a92a277f2fff4664f3c1749 -
 JSONCompactEachRow, false
-ba1081a754a06ef6563840b2d8d4d327 -
+0c1efbbc25a5bd90a2ecea559d283667 -
 JSONCompactEachRow, true
-ba1081a754a06ef6563840b2d8d4d327 -
+0c1efbbc25a5bd90a2ecea559d283667 -
+JSONCompactStringsEachRow, false
+0c1efbbc25a5bd90a2ecea559d283667 -
+JSONCompactStringsEachRow, true
+0c1efbbc25a5bd90a2ecea559d283667 -
+JSONCompactEachRowWithNames, false
+b9e4f8ecadbb650245d1762f4187ee0a -
+JSONCompactEachRowWithNames, true
+b9e4f8ecadbb650245d1762f4187ee0a -
+JSONCompactStringsEachRowWithNames, false
+b9e4f8ecadbb650245d1762f4187ee0a -
+JSONCompactStringsEachRowWithNames, true
+b9e4f8ecadbb650245d1762f4187ee0a -
+JSONCompactEachRowWithNamesAndTypes, false
+8b41f7375999b53d4c9607398456fe5b -
+JSONCompactEachRowWithNamesAndTypes, true
+8b41f7375999b53d4c9607398456fe5b -
 JSONCompactStringsEachRowWithNamesAndTypes, false
-31ded3cd9971b124450fb5a44a8bce63 -
+8b41f7375999b53d4c9607398456fe5b -
 JSONCompactStringsEachRowWithNamesAndTypes, true
-31ded3cd9971b124450fb5a44a8bce63 -
+8b41f7375999b53d4c9607398456fe5b -
diff --git a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh
index 5d54328e45d..f6c87eabfde 100755
--- a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh
+++ b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh
@@ -6,15 +6,15 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 . "$CURDIR"/../shell_config.sh
 
-FORMATS=('JSONEachRow' 'JSONCompactEachRow' 'JSONCompactStringsEachRowWithNamesAndTypes')
+FORMATS=('JSONEachRow' 'JSONCompactEachRow' 'JSONCompactStringsEachRow' 'JSONCompactEachRowWithNames' 'JSONCompactStringsEachRowWithNames' 'JSONCompactEachRowWithNamesAndTypes' 'JSONCompactStringsEachRowWithNamesAndTypes')
 
 for format in "${FORMATS[@]}"
 do
     echo "$format, false";
     $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \
-        "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c Format $format" | md5sum
+        "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c LIMIT 3000000 Format $format" | md5sum
 
     echo "$format, true";
     $CLICKHOUSE_CLIENT --output_format_parallel_formatting=true -q \
-        "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c Format $format" | md5sum
+        "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c LIMIT 3000000 Format $format" | md5sum
 done
diff --git a/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference
new file mode 100644
index 00000000000..0c0367694b2
--- /dev/null
+++ b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference
@@ -0,0 +1,20 @@
+TSVWithNamesAndTypes, false
+7c1feeaae418e502d66fcc8e31946f2e -
+TSVWithNamesAndTypes, true
+7c1feeaae418e502d66fcc8e31946f2e -
+CSVWithNamesAndTypes, false
+7c1feeaae418e502d66fcc8e31946f2e -
+CSVWithNamesAndTypes, true
+7c1feeaae418e502d66fcc8e31946f2e -
+JSONStringsEachRow, false
+7c1feeaae418e502d66fcc8e31946f2e -
+JSONStringsEachRow, true
+7c1feeaae418e502d66fcc8e31946f2e -
+JSONCompactEachRowWithNamesAndTypes, false
+7c1feeaae418e502d66fcc8e31946f2e -
+JSONCompactEachRowWithNamesAndTypes, true
+7c1feeaae418e502d66fcc8e31946f2e -
+JSONCompactStringsEachRowWithNamesAndTypes, false
+7c1feeaae418e502d66fcc8e31946f2e -
+JSONCompactStringsEachRowWithNamesAndTypes, true
+7c1feeaae418e502d66fcc8e31946f2e -
diff --git a/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh
new file mode 100755
index 00000000000..9fdca20d097
--- /dev/null
+++ b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+FORMATS=('TSVWithNamesAndTypes' 'CSVWithNamesAndTypes' 'JSONStringsEachRow' 'JSONCompactEachRowWithNamesAndTypes' 'JSONCompactStringsEachRowWithNamesAndTypes')
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parsing_with_names"
+
+for format in "${FORMATS[@]}"
+do
+    # Columns are permuted
+    $CLICKHOUSE_CLIENT -q "CREATE TABLE parsing_with_names(c FixedString(16), a DateTime('Europe/Moscow'), b String) ENGINE=Memory()"
+
+    echo "$format, false";
+    $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \
+        "SELECT URLRegions as d, toTimeZone(ClientEventTime, 'Europe/Moscow') as a, MobilePhoneModel as b, ParamPrice as e, ClientIP6 as c FROM test.hits LIMIT 5000 Format $format" | \
+    $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_parallel_parsing=false -q "INSERT INTO parsing_with_names FORMAT $format SETTINGS input_format_null_as_default=0"
+
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM parsing_with_names;" | md5sum
+    $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parsing_with_names"
+
+
+    $CLICKHOUSE_CLIENT -q "CREATE TABLE parsing_with_names(c FixedString(16), a DateTime('Europe/Moscow'), b String) ENGINE=Memory()"
+    echo "$format, true";
+    $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \
+        "SELECT URLRegions as d, toTimeZone(ClientEventTime, 'Europe/Moscow') as a, MobilePhoneModel as b, ParamPrice as e, ClientIP6 as c FROM test.hits LIMIT 5000 Format $format" | \
+    $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_parallel_parsing=true -q "INSERT INTO parsing_with_names FORMAT $format SETTINGS input_format_null_as_default=0"
+
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM parsing_with_names;" | md5sum
+    $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parsing_with_names"
+done