Merge branch 'master' into change-server-memory-usage-without-restart

Commit 712bef8eef by Alexey Milovidov, 2021-01-18 00:00:09 +03:00
146 changed files with 5020 additions and 582 deletions

.gitmodules vendored (6 lines changed)

@ -209,6 +209,12 @@
[submodule "contrib/fast_float"]
path = contrib/fast_float
url = https://github.com/fastfloat/fast_float
[submodule "contrib/libpqxx"]
path = contrib/libpqxx
url = https://github.com/jtv/libpqxx
[submodule "contrib/libpq"]
path = contrib/libpq
url = https://github.com/ClickHouse-Extras/libpq
[submodule "contrib/boringssl"]
path = contrib/boringssl
url = https://github.com/ClickHouse-Extras/boringssl.git


@ -490,6 +490,7 @@ include (cmake/find/rapidjson.cmake)
include (cmake/find/fastops.cmake)
include (cmake/find/odbc.cmake)
include (cmake/find/rocksdb.cmake)
include (cmake/find/libpqxx.cmake)
include (cmake/find/nuraft.cmake)

cmake/find/libpqxx.cmake (new file, 31 lines)

@ -0,0 +1,31 @@
option(ENABLE_LIBPQXX "Enable libpqxx" ${ENABLE_LIBRARIES})
if (NOT ENABLE_LIBPQXX)
return()
endif()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libpqxx/CMakeLists.txt")
message (WARNING "submodule contrib/libpqxx is missing. to fix try run: \n git submodule update --init --recursive")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal libpqxx library")
set (USE_LIBPQXX 0)
return()
endif()
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libpq/include")
message (ERROR "submodule contrib/libpq is missing. to fix try run: \n git submodule update --init --recursive")
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal libpq needed for libpqxx")
set (USE_LIBPQXX 0)
return()
endif()
if (NOT USE_INTERNAL_SSL_LIBRARY)
set (USE_LIBPQXX 0)
else ()
set (USE_LIBPQXX 1)
set (LIBPQXX_LIBRARY libpqxx)
set (LIBPQ_LIBRARY libpq)
set (LIBPQXX_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libpqxx/include")
set (LIBPQ_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/libpq")
message (STATUS "Using libpqxx=${USE_LIBPQXX}: ${LIBPQXX_INCLUDE_DIR} : ${LIBPQXX_LIBRARY}")
message (STATUS "Using libpq: ${LIBPQ_ROOT_DIR} : ${LIBPQ_INCLUDE_DIR} : ${LIBPQ_LIBRARY}")
endif()
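For reference, the `USE_LIBPQXX` flag set here is consumed by the new PostgreSQL sources later in this commit through the generated config header; a minimal hedged sketch of that guard pattern (mirroring `config_core.h.in` and `PostgreSQLBlockInputStream.h` below):

```cpp
#if !defined(ARCADIA_BUILD)
#include "config_core.h"   // carries "#cmakedefine01 USE_LIBPQXX" produced from this cmake option
#endif

#if USE_LIBPQXX
#include <pqxx/pqxx>       // compiled only when the libpqxx/libpq submodules are enabled
#endif
```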


@ -310,6 +310,11 @@ if (USE_INTERNAL_ROCKSDB_LIBRARY)
add_subdirectory(rocksdb-cmake)
endif()
if (USE_LIBPQXX)
add_subdirectory (libpq-cmake)
add_subdirectory (libpqxx-cmake)
endif()
if (USE_NURAFT)
add_subdirectory(nuraft-cmake)
endif()

contrib/libpq vendored Submodule (1 line changed)

@ -0,0 +1 @@
Subproject commit 8e7e905854714a7fbb49c124dbc45c7bd4b98e07


@ -0,0 +1,58 @@
set(LIBPQ_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/libpq)
set(SRCS
${LIBPQ_SOURCE_DIR}/fe-auth.c
${LIBPQ_SOURCE_DIR}/fe-auth-scram.c
${LIBPQ_SOURCE_DIR}/fe-connect.c
${LIBPQ_SOURCE_DIR}/fe-exec.c
${LIBPQ_SOURCE_DIR}/fe-lobj.c
${LIBPQ_SOURCE_DIR}/fe-misc.c
${LIBPQ_SOURCE_DIR}/fe-print.c
${LIBPQ_SOURCE_DIR}/fe-protocol2.c
${LIBPQ_SOURCE_DIR}/fe-protocol3.c
${LIBPQ_SOURCE_DIR}/fe-secure.c
${LIBPQ_SOURCE_DIR}/fe-secure-common.c
${LIBPQ_SOURCE_DIR}/fe-secure-openssl.c
${LIBPQ_SOURCE_DIR}/legacy-pqsignal.c
${LIBPQ_SOURCE_DIR}/libpq-events.c
${LIBPQ_SOURCE_DIR}/pqexpbuffer.c
${LIBPQ_SOURCE_DIR}/common/scram-common.c
${LIBPQ_SOURCE_DIR}/common/sha2_openssl.c
${LIBPQ_SOURCE_DIR}/common/md5.c
${LIBPQ_SOURCE_DIR}/common/saslprep.c
${LIBPQ_SOURCE_DIR}/common/unicode_norm.c
${LIBPQ_SOURCE_DIR}/common/ip.c
${LIBPQ_SOURCE_DIR}/common/jsonapi.c
${LIBPQ_SOURCE_DIR}/common/wchar.c
${LIBPQ_SOURCE_DIR}/common/base64.c
${LIBPQ_SOURCE_DIR}/common/link-canary.c
${LIBPQ_SOURCE_DIR}/common/fe_memutils.c
${LIBPQ_SOURCE_DIR}/common/string.c
${LIBPQ_SOURCE_DIR}/common/pg_get_line.c
${LIBPQ_SOURCE_DIR}/common/stringinfo.c
${LIBPQ_SOURCE_DIR}/common/psprintf.c
${LIBPQ_SOURCE_DIR}/common/encnames.c
${LIBPQ_SOURCE_DIR}/common/logging.c
${LIBPQ_SOURCE_DIR}/port/snprintf.c
${LIBPQ_SOURCE_DIR}/port/strlcpy.c
${LIBPQ_SOURCE_DIR}/port/strerror.c
${LIBPQ_SOURCE_DIR}/port/inet_net_ntop.c
${LIBPQ_SOURCE_DIR}/port/getpeereid.c
${LIBPQ_SOURCE_DIR}/port/chklocale.c
${LIBPQ_SOURCE_DIR}/port/noblock.c
${LIBPQ_SOURCE_DIR}/port/pg_strong_random.c
${LIBPQ_SOURCE_DIR}/port/pgstrcasecmp.c
${LIBPQ_SOURCE_DIR}/port/thread.c
${LIBPQ_SOURCE_DIR}/port/path.c
${LIBPQ_SOURCE_DIR}/port/explicit_bzero.c
)
add_library(libpq ${SRCS})
target_include_directories (libpq PUBLIC ${LIBPQ_SOURCE_DIR})
target_include_directories (libpq PUBLIC ${LIBPQ_SOURCE_DIR}/include)
target_include_directories (libpq PRIVATE ${LIBPQ_SOURCE_DIR}/configs)
target_link_libraries (libpq PRIVATE ssl)

contrib/libpqxx vendored Submodule (1 line changed)

@ -0,0 +1 @@
Subproject commit 58d2a028d1600225ac3a478d6b3a06ba2f0c01f6


@ -0,0 +1,78 @@
set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/libpqxx)
set (SRCS
${LIBRARY_DIR}/src/strconv.cxx
${LIBRARY_DIR}/src/array.cxx
${LIBRARY_DIR}/src/binarystring.cxx
${LIBRARY_DIR}/src/connection.cxx
${LIBRARY_DIR}/src/cursor.cxx
${LIBRARY_DIR}/src/encodings.cxx
${LIBRARY_DIR}/src/errorhandler.cxx
${LIBRARY_DIR}/src/except.cxx
${LIBRARY_DIR}/src/field.cxx
${LIBRARY_DIR}/src/largeobject.cxx
${LIBRARY_DIR}/src/notification.cxx
${LIBRARY_DIR}/src/pipeline.cxx
${LIBRARY_DIR}/src/result.cxx
${LIBRARY_DIR}/src/robusttransaction.cxx
${LIBRARY_DIR}/src/sql_cursor.cxx
${LIBRARY_DIR}/src/stream_from.cxx
${LIBRARY_DIR}/src/stream_to.cxx
${LIBRARY_DIR}/src/subtransaction.cxx
${LIBRARY_DIR}/src/transaction.cxx
${LIBRARY_DIR}/src/transaction_base.cxx
${LIBRARY_DIR}/src/row.cxx
${LIBRARY_DIR}/src/util.cxx
${LIBRARY_DIR}/src/version.cxx
)
# Each header file is listed explicitly because the directory include/pqxx also contains
# extension-less headers such as 'array'. Including the whole directory with
# `target_include_directories` would make them conflict with all includes of <array>.
set (HDRS
${LIBRARY_DIR}/include/pqxx/array.hxx
${LIBRARY_DIR}/include/pqxx/binarystring.hxx
${LIBRARY_DIR}/include/pqxx/composite.hxx
${LIBRARY_DIR}/include/pqxx/connection.hxx
${LIBRARY_DIR}/include/pqxx/cursor.hxx
${LIBRARY_DIR}/include/pqxx/dbtransaction.hxx
${LIBRARY_DIR}/include/pqxx/errorhandler.hxx
${LIBRARY_DIR}/include/pqxx/except.hxx
${LIBRARY_DIR}/include/pqxx/field.hxx
${LIBRARY_DIR}/include/pqxx/isolation.hxx
${LIBRARY_DIR}/include/pqxx/largeobject.hxx
${LIBRARY_DIR}/include/pqxx/nontransaction.hxx
${LIBRARY_DIR}/include/pqxx/notification.hxx
${LIBRARY_DIR}/include/pqxx/pipeline.hxx
${LIBRARY_DIR}/include/pqxx/prepared_statement.hxx
${LIBRARY_DIR}/include/pqxx/result.hxx
${LIBRARY_DIR}/include/pqxx/robusttransaction.hxx
${LIBRARY_DIR}/include/pqxx/row.hxx
${LIBRARY_DIR}/include/pqxx/separated_list.hxx
${LIBRARY_DIR}/include/pqxx/strconv.hxx
${LIBRARY_DIR}/include/pqxx/stream_from.hxx
${LIBRARY_DIR}/include/pqxx/stream_to.hxx
${LIBRARY_DIR}/include/pqxx/subtransaction.hxx
${LIBRARY_DIR}/include/pqxx/transaction.hxx
${LIBRARY_DIR}/include/pqxx/transaction_base.hxx
${LIBRARY_DIR}/include/pqxx/types.hxx
${LIBRARY_DIR}/include/pqxx/util.hxx
${LIBRARY_DIR}/include/pqxx/version.hxx
${LIBRARY_DIR}/include/pqxx/zview.hxx
)
add_library(libpqxx ${SRCS} ${HDRS})
target_link_libraries(libpqxx PUBLIC ${LIBPQ_LIBRARY})
target_include_directories (libpqxx PRIVATE ${LIBRARY_DIR}/include)
# Workaround: libpqxx expects several generated config headers; produce all of them from the same template.
set(CM_CONFIG_H_IN "${LIBRARY_DIR}/include/pqxx/config.h.in")
set(CM_CONFIG_PUB "${LIBRARY_DIR}/include/pqxx/config-public-compiler.h")
set(CM_CONFIG_INT "${LIBRARY_DIR}/include/pqxx/config-internal-compiler.h")
set(CM_CONFIG_PQ "${LIBRARY_DIR}/include/pqxx/config-internal-libpq.h")
configure_file("${CM_CONFIG_H_IN}" "${CM_CONFIG_INT}" @ONLY)
configure_file("${CM_CONFIG_H_IN}" "${CM_CONFIG_PUB}" @ONLY)
configure_file("${CM_CONFIG_H_IN}" "${CM_CONFIG_PQ}" @ONLY)


@ -8,6 +8,7 @@ stage=${stage:-}
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "$script_dir"
repo_dir=ch
BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-11_debug_none_bundled_unsplitted_disable_False_binary"}
function clone
{
@ -35,7 +36,7 @@ function download
# wget -O- -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/performance/performance.tgz" \
# | tar --strip-components=1 -zxv
wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-11_debug_none_bundled_unsplitted_disable_False_binary/clickhouse"
wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"
chmod +x clickhouse
ln -s ./clickhouse ./clickhouse-server
ln -s ./clickhouse ./clickhouse-client


@ -538,11 +538,11 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCas
!!! note "Note"
For the UTF-8 case we use 3-gram distance. These are not perfectly fair n-gram distances: we use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables, so collisions may occur. In the UTF-8 case-insensitive format we do not use a fair `tolower` function: we zero the 5th bit (starting from zero) of each codepoint byte, and the first bit of the zeroth byte if there is more than one byte. This works for Latin and mostly for all Cyrillic letters.
## countSubstrings(haystack, needle) {#countSubstrings}
## countSubstrings {#countSubstrings}
Count the number of substring occurrences
Returns the number of substring occurrences.
For a case-insensitive search, use the function `countSubstringsCaseInsensitive` (or `countSubstringsCaseInsensitiveUTF8`).
For a case-insensitive search, use [countSubstringsCaseInsensitive](../../sql-reference/functions/string-search-functions.md#countSubstringsCaseInsensitive) or [countSubstringsCaseInsensitiveUTF8](../../sql-reference/functions/string-search-functions.md#countSubstringsCaseInsensitiveUTF8) functions.
**Syntax**
@ -554,20 +554,20 @@ countSubstrings(haystack, needle[, start_pos])
- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `start_pos` Optional parameter, position of the first character in the string to start search. [UInt](../../sql-reference/data-types/int-uint.md)
- `start_pos` Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md).
**Returned values**
- Number of occurrences.
Type: `Integer`.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT countSubstrings('foobar.com', '.')
SELECT countSubstrings('foobar.com', '.');
```
Result:
@ -581,7 +581,7 @@ Result:
Query:
``` sql
SELECT countSubstrings('aaaa', 'aa')
SELECT countSubstrings('aaaa', 'aa');
```
Result:
@ -592,6 +592,138 @@ Result:
└───────────────────────────────┘
```
Query:
```sql
SELECT countSubstrings('abc___abc', 'abc', 4);
```
Result:
``` text
┌─countSubstrings('abc___abc', 'abc', 4)─┐
│ 1 │
└────────────────────────────────────────┘
```
## countSubstringsCaseInsensitive {#countSubstringsCaseInsensitive}
Returns the number of substring occurrences (case-insensitive).
**Syntax**
``` sql
countSubstringsCaseInsensitive(haystack, needle[, start_pos])
```
**Parameters**
- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `start_pos` Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md).
**Returned values**
- Number of occurrences.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
select countSubstringsCaseInsensitive('aba', 'B');
```
Result:
``` text
┌─countSubstringsCaseInsensitive('aba', 'B')─┐
│ 1 │
└────────────────────────────────────────────┘
```
Query:
``` sql
SELECT countSubstringsCaseInsensitive('foobar.com', 'CoM');
```
Result:
``` text
┌─countSubstringsCaseInsensitive('foobar.com', 'CoM')─┐
│ 1 │
└─────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT countSubstringsCaseInsensitive('abC___abC', 'aBc', 2);
```
Result:
``` text
┌─countSubstringsCaseInsensitive('abC___abC', 'aBc', 2)─┐
│ 1 │
└───────────────────────────────────────────────────────┘
```
## countSubstringsCaseInsensitiveUTF8 {#countSubstringsCaseInsensitiveUTF8}
Returns the number of substring occurrences in `UTF-8` (case-insensitive).
**Syntax**
``` sql
SELECT countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos])
```
**Parameters**
- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `start_pos` Position of the first character in the string to start search. Optional. [UInt](../../sql-reference/data-types/int-uint.md).
**Returned values**
- Number of occurrences.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT countSubstringsCaseInsensitiveUTF8('абв', 'A');
```
Result:
``` text
┌─countSubstringsCaseInsensitiveUTF8('абв', 'A')─┐
│ 1 │
└────────────────────────────────────────────────┘
```
Query:
```sql
SELECT countSubstringsCaseInsensitiveUTF8('аБв__АбВ__абв', 'Абв');
```
Result:
``` text
┌─countSubstringsCaseInsensitiveUTF8('аБв__АбВ__абв', 'Абв')─┐
│ 3 │
└────────────────────────────────────────────────────────────┘
```
## countMatches(haystack, pattern) {#countmatcheshaystack-pattern}
Returns the number of regular expression matches for a `pattern` in a `haystack`.


@ -573,4 +573,190 @@ SELECT countMatches('aaaa', 'aa');
└───────────────────────────────┘
```
## countSubstrings {#countSubstrings}
Returns the number of substring occurrences.
For a case-insensitive search, use the [countSubstringsCaseInsensitive](../../sql-reference/functions/string-search-functions.md#countSubstringsCaseInsensitive) or [countSubstringsCaseInsensitiveUTF8](../../sql-reference/functions/string-search-functions.md#countSubstringsCaseInsensitiveUTF8) functions.
**Syntax**
``` sql
countSubstrings(haystack, needle[, start_pos])
```
**Parameters**
- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `start_pos` Position of the first character in the string to start the search. Optional. [UInt](../../sql-reference/data-types/int-uint.md).
**Returned values**
- Number of occurrences.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT countSubstrings('foobar.com', '.');
```
Result:
``` text
┌─countSubstrings('foobar.com', '.')─┐
│ 1 │
└────────────────────────────────────┘
```
Query:
``` sql
SELECT countSubstrings('aaaa', 'aa');
```
Result:
``` text
┌─countSubstrings('aaaa', 'aa')─┐
│ 2 │
└───────────────────────────────┘
```
Query:
```sql
SELECT countSubstrings('abc___abc', 'abc', 4);
```
Result:
``` text
┌─countSubstrings('abc___abc', 'abc', 4)─┐
│ 1 │
└────────────────────────────────────────┘
```
## countSubstringsCaseInsensitive {#countSubstringsCaseInsensitive}
Returns the number of substring occurrences (case-insensitive).
**Syntax**
``` sql
countSubstringsCaseInsensitive(haystack, needle[, start_pos])
```
**Parameters**
- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `start_pos` Position of the first character in the string to start the search. Optional. [UInt](../../sql-reference/data-types/int-uint.md).
**Returned values**
- Number of occurrences.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
select countSubstringsCaseInsensitive('aba', 'B');
```
Result:
``` text
┌─countSubstringsCaseInsensitive('aba', 'B')─┐
│ 1 │
└────────────────────────────────────────────┘
```
Query:
``` sql
SELECT countSubstringsCaseInsensitive('foobar.com', 'CoM');
```
Result:
``` text
┌─countSubstringsCaseInsensitive('foobar.com', 'CoM')─┐
│ 1 │
└─────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT countSubstringsCaseInsensitive('abC___abC', 'aBc', 2);
```
Result:
``` text
┌─countSubstringsCaseInsensitive('abC___abC', 'aBc', 2)─┐
│ 1 │
└───────────────────────────────────────────────────────┘
```
## countSubstringsCaseInsensitiveUTF8 {#countSubstringsCaseInsensitiveUTF8}
Returns the number of substring occurrences in `UTF-8` (case-insensitive).
**Syntax**
``` sql
SELECT countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos])
```
**Parameters**
- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
- `start_pos` Position of the first character in the string to start the search. Optional. [UInt](../../sql-reference/data-types/int-uint.md).
**Returned values**
- Number of occurrences.
Type: [UInt64](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT countSubstringsCaseInsensitiveUTF8('абв', 'A');
```
Result:
``` text
┌─countSubstringsCaseInsensitiveUTF8('абв', 'A')─┐
│ 1 │
└────────────────────────────────────────────────┘
```
Query:
```sql
SELECT countSubstringsCaseInsensitiveUTF8('аБв__АбВ__абв', 'Абв');
```
Result:
``` text
┌─countSubstringsCaseInsensitiveUTF8('аБв__АбВ__абв', 'Абв')─┐
│ 3 │
└────────────────────────────────────────────────────────────┘
```
[Original article](https://clickhouse.tech/docs/ru/query_language/functions/string_search_functions/) <!--hide-->


@ -1,6 +1,6 @@
## INSERT {#insert}
## INSERT INTO Statement {#insert}
The INSERT query is mainly used to add data to the system.
The INSERT INTO statement is mainly used to add data to the system.
Basic query format:
@ -8,7 +8,52 @@ The INSERT query is mainly used to add data to the system.
INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ...
```
You can specify a list of columns to insert in the query, e.g. `[(c1, c2, c3)]`. Columns that exist in the table structure but are not present in the insert list are filled as follows:
You can specify a list of columns to insert in the query, e.g. `[(c1, c2, c3)]`. You can also use an expression with a column [matcher](../../sql-reference/statements/select/index.md#asterisk) such as `*` and/or [modifiers](../../sql-reference/statements/select/index.md#select-modifiers) such as [APPLY](../../sql-reference/statements/select/index.md#apply-modifier), [EXCEPT](../../sql-reference/statements/select/index.md#apply-modifier), [REPLACE](../../sql-reference/statements/select/index.md#replace-modifier).
For example, consider this table:
``` sql
SHOW CREATE insert_select_testtable;
```
```text
CREATE TABLE insert_select_testtable
(
`a` Int8,
`b` String,
`c` Int8
)
ENGINE = MergeTree()
ORDER BY a
SETTINGS index_granularity = 8192
```
``` sql
INSERT INTO insert_select_testtable (*) VALUES (1, 'a', 1) ;
```
If you want to insert data into all columns except 'b', you need to pass as many values as there are columns selected in the parentheses:
``` sql
INSERT INTO insert_select_testtable (* EXCEPT(b)) Values (2, 2);
```
``` sql
SELECT * FROM insert_select_testtable;
```
```
┌─a─┬─b─┬─c─┐
│ 2 │ │ 2 │
└───┴───┴───┘
┌─a─┬─b─┬─c─┐
│ 1 │ a │ 1 │
└───┴───┴───┘
```
In this example, we see that the second inserted row has its `a` and `c` columns filled with the passed values, and the `b` column filled with the default value.
Columns that exist in the table structure but are not present in the insert list are filled as follows:
- If a `DEFAULT` expression is defined, the inserted value is computed from that expression.
- If no `DEFAULT` expression is defined, zeros or empty strings are inserted.


@ -0,0 +1,17 @@
# ALL Clause {#select-all}
`SELECT ALL` is identical to `SELECT` without `DISTINCT`.
- If `ALL` is specified, it is ignored.
- If both `ALL` and `DISTINCT` are specified, an exception will be thrown.
`ALL` can also be specified inside aggregate functions, with the same effect (a no-op). For example:
```sql
SELECT sum(ALL number) FROM numbers(10);
```
is equivalent to
```sql
SELECT sum(number) FROM numbers(10);
```


@ -159,6 +159,7 @@ enum class AccessType
M(REMOTE, "", GLOBAL, SOURCES) \
M(MONGO, "", GLOBAL, SOURCES) \
M(MYSQL, "", GLOBAL, SOURCES) \
M(POSTGRES, "", GLOBAL, SOURCES) \
M(ODBC, "", GLOBAL, SOURCES) \
M(JDBC, "", GLOBAL, SOURCES) \
M(HDFS, "", GLOBAL, SOURCES) \


@ -79,6 +79,11 @@ if (USE_AMQPCPP)
add_headers_and_sources(dbms Storages/RabbitMQ)
endif()
if (USE_LIBPQXX)
add_headers_and_sources(dbms Databases/PostgreSQL)
add_headers_and_sources(dbms Storages/PostgreSQL)
endif()
if (USE_ROCKSDB)
add_headers_and_sources(dbms Storages/RocksDB)
endif()
@ -439,6 +444,11 @@ if (USE_ROCKSDB)
dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR})
endif()
if (USE_LIBPQXX)
dbms_target_link_libraries(PUBLIC ${LIBPQXX_LIBRARY})
dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LIBPQXX_INCLUDE_DIR})
endif()
dbms_target_link_libraries(PRIVATE _boost_context)
if (ENABLE_TESTS AND USE_GTEST)


@ -86,6 +86,9 @@ public:
const ColumnArray & getNestedColumn() const { return assert_cast<const ColumnArray &>(*nested); }
ColumnArray & getNestedColumn() { return assert_cast<ColumnArray &>(*nested); }
const ColumnPtr & getNestedColumnPtr() const { return nested; }
ColumnPtr & getNestedColumnPtr() { return nested; }
const ColumnTuple & getNestedData() const { return assert_cast<const ColumnTuple &>(getNestedColumn().getData()); }
ColumnTuple & getNestedData() { return assert_cast<ColumnTuple &>(getNestedColumn().getData()); }
};


@ -143,9 +143,11 @@ public:
const IColumn & getNestedColumn() const { return *nested_column; }
const ColumnPtr & getNestedColumnPtr() const { return nested_column; }
ColumnPtr & getNestedColumnPtr() { return nested_column; }
/// Return the column that represents the byte map.
const ColumnPtr & getNullMapColumnPtr() const { return null_map; }
ColumnPtr & getNullMapColumnPtr() { return null_map; }
ColumnUInt8 & getNullMapColumn() { return assert_cast<ColumnUInt8 &>(*null_map); }
const ColumnUInt8 & getNullMapColumn() const { return assert_cast<const ColumnUInt8 &>(*null_map); }


@ -99,6 +99,7 @@ public:
Columns getColumnsCopy() const { return {columns.begin(), columns.end()}; }
const ColumnPtr & getColumnPtr(size_t idx) const { return columns[idx]; }
ColumnPtr & getColumnPtr(size_t idx) { return columns[idx]; }
private:
int compareAtImpl(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator * collator=nullptr) const;


@ -4,6 +4,7 @@
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesDecimal.h>
@ -35,49 +36,53 @@ void ExternalResultDescription::init(const Block & sample_block_)
DataTypePtr type_not_nullable = removeNullable(elem.type);
const IDataType * type = type_not_nullable.get();
if (typeid_cast<const DataTypeUInt8 *>(type))
WhichDataType which(type);
if (which.isUInt8())
types.emplace_back(ValueType::vtUInt8, is_nullable);
else if (typeid_cast<const DataTypeUInt16 *>(type))
else if (which.isUInt16())
types.emplace_back(ValueType::vtUInt16, is_nullable);
else if (typeid_cast<const DataTypeUInt32 *>(type))
else if (which.isUInt32())
types.emplace_back(ValueType::vtUInt32, is_nullable);
else if (typeid_cast<const DataTypeUInt64 *>(type))
else if (which.isUInt64())
types.emplace_back(ValueType::vtUInt64, is_nullable);
else if (typeid_cast<const DataTypeInt8 *>(type))
else if (which.isInt8())
types.emplace_back(ValueType::vtInt8, is_nullable);
else if (typeid_cast<const DataTypeInt16 *>(type))
else if (which.isInt16())
types.emplace_back(ValueType::vtInt16, is_nullable);
else if (typeid_cast<const DataTypeInt32 *>(type))
else if (which.isInt32())
types.emplace_back(ValueType::vtInt32, is_nullable);
else if (typeid_cast<const DataTypeInt64 *>(type))
else if (which.isInt64())
types.emplace_back(ValueType::vtInt64, is_nullable);
else if (typeid_cast<const DataTypeFloat32 *>(type))
else if (which.isFloat32())
types.emplace_back(ValueType::vtFloat32, is_nullable);
else if (typeid_cast<const DataTypeFloat64 *>(type))
else if (which.isFloat64())
types.emplace_back(ValueType::vtFloat64, is_nullable);
else if (typeid_cast<const DataTypeString *>(type))
else if (which.isString())
types.emplace_back(ValueType::vtString, is_nullable);
else if (typeid_cast<const DataTypeDate *>(type))
else if (which.isDate())
types.emplace_back(ValueType::vtDate, is_nullable);
else if (typeid_cast<const DataTypeDateTime *>(type))
else if (which.isDateTime())
types.emplace_back(ValueType::vtDateTime, is_nullable);
else if (typeid_cast<const DataTypeUUID *>(type))
else if (which.isUUID())
types.emplace_back(ValueType::vtUUID, is_nullable);
else if (typeid_cast<const DataTypeEnum8 *>(type))
else if (which.isEnum8())
types.emplace_back(ValueType::vtString, is_nullable);
else if (typeid_cast<const DataTypeEnum16 *>(type))
else if (which.isEnum16())
types.emplace_back(ValueType::vtString, is_nullable);
else if (typeid_cast<const DataTypeDateTime64 *>(type))
else if (which.isDateTime64())
types.emplace_back(ValueType::vtDateTime64, is_nullable);
else if (typeid_cast<const DataTypeDecimal<Decimal32> *>(type))
else if (which.isDecimal32())
types.emplace_back(ValueType::vtDecimal32, is_nullable);
else if (typeid_cast<const DataTypeDecimal<Decimal64> *>(type))
else if (which.isDecimal64())
types.emplace_back(ValueType::vtDecimal64, is_nullable);
else if (typeid_cast<const DataTypeDecimal<Decimal128> *>(type))
else if (which.isDecimal128())
types.emplace_back(ValueType::vtDecimal128, is_nullable);
else if (typeid_cast<const DataTypeDecimal<Decimal256> *>(type))
else if (which.isDecimal256())
types.emplace_back(ValueType::vtDecimal256, is_nullable);
else if (typeid_cast<const DataTypeFixedString *>(type))
else if (which.isArray())
types.emplace_back(ValueType::vtArray, is_nullable);
else if (which.isFixedString())
types.emplace_back(ValueType::vtFixedString, is_nullable);
else
throw Exception{"Unsupported type " + type->getName(), ErrorCodes::UNKNOWN_TYPE};


@ -31,6 +31,7 @@ struct ExternalResultDescription
vtDecimal64,
vtDecimal128,
vtDecimal256,
vtArray,
vtFixedString
};


@ -17,6 +17,29 @@ namespace ErrorCodes
extern const int THERE_IS_NO_COLUMN;
}
NameAndTypePair::NameAndTypePair(
const String & name_in_storage_, const String & subcolumn_name_,
const DataTypePtr & type_in_storage_, const DataTypePtr & subcolumn_type_)
: name(name_in_storage_ + (subcolumn_name_.empty() ? "" : "." + subcolumn_name_))
, type(subcolumn_type_)
, type_in_storage(type_in_storage_)
, subcolumn_delimiter_position(name_in_storage_.size()) {}
String NameAndTypePair::getNameInStorage() const
{
if (!subcolumn_delimiter_position)
return name;
return name.substr(0, *subcolumn_delimiter_position);
}
String NameAndTypePair::getSubcolumnName() const
{
if (!subcolumn_delimiter_position)
return "";
return name.substr(*subcolumn_delimiter_position + 1, name.size() - *subcolumn_delimiter_position);
}
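A hedged illustration of the new accessors; the expected values follow directly from the constructor above, and the concrete types are placeholders:

```cpp
// Build a pair describing the subcolumn "size0" of a storage column "arr".
auto subcolumn_type = std::make_shared<DataTypeUInt64>();           // assumed subcolumn type
NameAndTypePair pair("arr", "size0", array_type, subcolumn_type);   // array_type: assumed DataTypePtr of the storage column

assert(pair.name == "arr.size0");            // full name is "<storage>.<subcolumn>"
assert(pair.getNameInStorage() == "arr");
assert(pair.getSubcolumnName() == "size0");
assert(pair.isSubcolumn());
```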
void NamesAndTypesList::readText(ReadBuffer & buf)
{
@ -137,25 +160,20 @@ NamesAndTypesList NamesAndTypesList::filter(const Names & names) const
NamesAndTypesList NamesAndTypesList::addTypes(const Names & names) const
{
/// NOTE: It's better to make a map in `IStorage` than to create it here every time again.
#if !defined(ARCADIA_BUILD)
google::dense_hash_map<StringRef, const DataTypePtr *, StringRefHash> types;
#else
google::sparsehash::dense_hash_map<StringRef, const DataTypePtr *, StringRefHash> types;
#endif
types.set_empty_key(StringRef());
std::unordered_map<std::string_view, const NameAndTypePair *> self_columns;
for (const NameAndTypePair & column : *this)
types[column.name] = &column.type;
for (const auto & column : *this)
self_columns[column.name] = &column;
NamesAndTypesList res;
for (const String & name : names)
{
auto it = types.find(name);
if (it == types.end())
auto it = self_columns.find(name);
if (it == self_columns.end())
throw Exception("No column " + name, ErrorCodes::THERE_IS_NO_COLUMN);
res.emplace_back(name, *it->second);
res.emplace_back(*it->second);
}
return res;
}


@ -15,11 +15,19 @@ namespace DB
struct NameAndTypePair
{
String name;
DataTypePtr type;
public:
NameAndTypePair() = default;
NameAndTypePair(const String & name_, const DataTypePtr & type_)
: name(name_), type(type_), type_in_storage(type_) {}
NameAndTypePair() {}
NameAndTypePair(const String & name_, const DataTypePtr & type_) : name(name_), type(type_) {}
NameAndTypePair(const String & name_in_storage_, const String & subcolumn_name_,
const DataTypePtr & type_in_storage_, const DataTypePtr & subcolumn_type_);
String getNameInStorage() const;
String getSubcolumnName() const;
bool isSubcolumn() const { return subcolumn_delimiter_position != std::nullopt; }
DataTypePtr getTypeInStorage() const { return type_in_storage; }
bool operator<(const NameAndTypePair & rhs) const
{
@ -30,8 +38,26 @@ struct NameAndTypePair
{
return name == rhs.name && type->equals(*rhs.type);
}
String name;
DataTypePtr type;
private:
DataTypePtr type_in_storage;
std::optional<size_t> subcolumn_delimiter_position;
};
/// This is needed to use structured bindings for NameAndTypePair
/// const auto & [name, type] = name_and_type
template <int I>
decltype(auto) get(const NameAndTypePair & name_and_type)
{
if constexpr (I == 0)
return name_and_type.name;
else if constexpr (I == 1)
return name_and_type.type;
}
using NamesAndTypes = std::vector<NameAndTypePair>;
class NamesAndTypesList : public std::list<NameAndTypePair>
@ -81,3 +107,10 @@ public:
};
}
namespace std
{
template <> struct tuple_size<DB::NameAndTypePair> : std::integral_constant<size_t, 2> {};
template <> struct tuple_element<0, DB::NameAndTypePair> { using type = DB::String; };
template <> struct tuple_element<1, DB::NameAndTypePair> { using type = DB::DataTypePtr; };
}
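The tuple_size/tuple_element specializations above, together with the free `get<I>` helper, are what make the structured-binding comment in this header work; a short hedged usage sketch (includes omitted):

```cpp
NamesAndTypesList columns;
columns.emplace_back("id", std::make_shared<DataTypeUInt64>());
columns.emplace_back("name", std::make_shared<DataTypeString>());

// Destructure each NameAndTypePair via the get<0>/get<1> helpers defined above.
for (const auto & [name, type] : columns)
    std::cout << name << " : " << type->getName() << '\n';
```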


@ -405,6 +405,7 @@ class IColumn;
M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \
M(Bool, enable_global_with_statement, false, "Propagate WITH statements to UNION queries and all subqueries", 0) \
M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \
M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \
M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \
M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \
M(Bool, optimize_skip_merged_partitions, false, "Skip partitions with one part with level > 0 in optimize final", 0) \


@ -12,4 +12,4 @@
#cmakedefine01 USE_OPENCL
#cmakedefine01 USE_LDAP
#cmakedefine01 USE_ROCKSDB
#cmakedefine01 USE_LIBPQXX


@ -71,7 +71,7 @@ void NativeBlockInputStream::resetParser()
is_killed.store(false);
}
void NativeBlockInputStream::readData(const IDataType & type, IColumn & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint)
void NativeBlockInputStream::readData(const IDataType & type, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint)
{
IDataType::DeserializeBinaryBulkSettings settings;
settings.getter = [&](IDataType::SubstreamPath) -> ReadBuffer * { return &istr; };
@ -82,8 +82,8 @@ void NativeBlockInputStream::readData(const IDataType & type, IColumn & column,
type.deserializeBinaryBulkStatePrefix(settings, state);
type.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state);
if (column.size() != rows)
throw Exception("Cannot read all data in NativeBlockInputStream. Rows read: " + toString(column.size()) + ". Rows expected: " + toString(rows) + ".",
if (column->size() != rows)
throw Exception("Cannot read all data in NativeBlockInputStream. Rows read: " + toString(column->size()) + ". Rows expected: " + toString(rows) + ".",
ErrorCodes::CANNOT_READ_ALL_DATA);
}
@ -158,11 +158,11 @@ Block NativeBlockInputStream::readImpl()
}
/// Data
MutableColumnPtr read_column = column.type->createColumn();
ColumnPtr read_column = column.type->createColumn();
double avg_value_size_hint = avg_value_size_hints.empty() ? 0 : avg_value_size_hints[i];
if (rows) /// If no rows, nothing to read.
readData(*column.type, *read_column, istr, rows, avg_value_size_hint);
readData(*column.type, read_column, istr, rows, avg_value_size_hint);
column.column = std::move(read_column);


@ -74,7 +74,7 @@ public:
String getName() const override { return "Native"; }
static void readData(const IDataType & type, IColumn & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint);
static void readData(const IDataType & type, ColumnPtr & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint);
Block getHeader() const override;


@ -0,0 +1,297 @@
#include "PostgreSQLBlockInputStream.h"
#if USE_LIBPQXX
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnDecimal.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesDecimal.h>
#include <Interpreters/convertFieldToType.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <Common/assert_cast.h>
#include <ext/range.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
PostgreSQLBlockInputStream::PostgreSQLBlockInputStream(
ConnectionPtr connection_,
const std::string & query_str_,
const Block & sample_block,
const UInt64 max_block_size_)
: query_str(query_str_)
, max_block_size(max_block_size_)
, connection(connection_)
{
description.init(sample_block);
for (const auto idx : ext::range(0, description.sample_block.columns()))
if (description.types[idx].first == ValueType::vtArray)
prepareArrayInfo(idx, description.sample_block.getByPosition(idx).type);
/// pqxx::stream_from uses the COPY command and fails if ';' is present in the query
if (query_str.ends_with(';'))
query_str.resize(query_str.size() - 1);
}
void PostgreSQLBlockInputStream::readPrefix()
{
tx = std::make_unique<pqxx::read_transaction>(*connection);
stream = std::make_unique<pqxx::stream_from>(*tx, pqxx::from_query, std::string_view(query_str));
}
Block PostgreSQLBlockInputStream::readImpl()
{
/// Check if pqxx::stream_from is finished
if (!stream || !(*stream))
return Block();
MutableColumns columns = description.sample_block.cloneEmptyColumns();
size_t num_rows = 0;
while (true)
{
const std::vector<pqxx::zview> * row{stream->read_row()};
/// row is nullptr if pqxx::stream_from is finished
if (!row)
break;
for (const auto idx : ext::range(0, row->size()))
{
const auto & sample = description.sample_block.getByPosition(idx);
/// If the value is NULL, pqxx::zview returns nullptr from c_str()
if ((*row)[idx].c_str())
{
if (description.types[idx].second)
{
ColumnNullable & column_nullable = assert_cast<ColumnNullable &>(*columns[idx]);
const auto & data_type = assert_cast<const DataTypeNullable &>(*sample.type);
insertValue(column_nullable.getNestedColumn(), (*row)[idx], description.types[idx].first, data_type.getNestedType(), idx);
column_nullable.getNullMapData().emplace_back(0);
}
else
{
insertValue(*columns[idx], (*row)[idx], description.types[idx].first, sample.type, idx);
}
}
else
{
insertDefaultValue(*columns[idx], *sample.column);
}
}
if (++num_rows == max_block_size)
break;
}
return description.sample_block.cloneWithColumns(std::move(columns));
}
void PostgreSQLBlockInputStream::readSuffix()
{
if (stream)
{
stream->complete();
tx->commit();
}
}
void PostgreSQLBlockInputStream::insertValue(IColumn & column, std::string_view value,
const ExternalResultDescription::ValueType type, const DataTypePtr data_type, size_t idx)
{
switch (type)
{
case ValueType::vtUInt8:
assert_cast<ColumnUInt8 &>(column).insertValue(pqxx::from_string<uint16_t>(value));
break;
case ValueType::vtUInt16:
assert_cast<ColumnUInt16 &>(column).insertValue(pqxx::from_string<uint16_t>(value));
break;
case ValueType::vtUInt32:
assert_cast<ColumnUInt32 &>(column).insertValue(pqxx::from_string<uint32_t>(value));
break;
case ValueType::vtUInt64:
assert_cast<ColumnUInt64 &>(column).insertValue(pqxx::from_string<uint64_t>(value));
break;
case ValueType::vtInt8:
assert_cast<ColumnInt8 &>(column).insertValue(pqxx::from_string<int16_t>(value));
break;
case ValueType::vtInt16:
assert_cast<ColumnInt16 &>(column).insertValue(pqxx::from_string<int16_t>(value));
break;
case ValueType::vtInt32:
assert_cast<ColumnInt32 &>(column).insertValue(pqxx::from_string<int32_t>(value));
break;
case ValueType::vtInt64:
assert_cast<ColumnInt64 &>(column).insertValue(pqxx::from_string<int64_t>(value));
break;
case ValueType::vtFloat32:
assert_cast<ColumnFloat32 &>(column).insertValue(pqxx::from_string<float>(value));
break;
case ValueType::vtFloat64:
assert_cast<ColumnFloat64 &>(column).insertValue(pqxx::from_string<double>(value));
break;
case ValueType::vtFixedString:[[fallthrough]];
case ValueType::vtString:
assert_cast<ColumnString &>(column).insertData(value.data(), value.size());
break;
case ValueType::vtUUID:
assert_cast<ColumnUInt128 &>(column).insert(parse<UUID>(value.data(), value.size()));
break;
case ValueType::vtDate:
assert_cast<ColumnUInt16 &>(column).insertValue(UInt16{LocalDate{std::string(value)}.getDayNum()});
break;
case ValueType::vtDateTime:
assert_cast<ColumnUInt32 &>(column).insertValue(time_t{LocalDateTime{std::string(value)}});
break;
case ValueType::vtDateTime64:[[fallthrough]];
case ValueType::vtDecimal32: [[fallthrough]];
case ValueType::vtDecimal64: [[fallthrough]];
case ValueType::vtDecimal128: [[fallthrough]];
case ValueType::vtDecimal256:
{
ReadBufferFromString istr(value);
data_type->deserializeAsWholeText(column, istr, FormatSettings{});
break;
}
case ValueType::vtArray:
{
pqxx::array_parser parser{value};
std::pair<pqxx::array_parser::juncture, std::string> parsed = parser.get_next();
size_t dimension = 0, max_dimension = 0, expected_dimensions = array_info[idx].num_dimensions;
const auto parse_value = array_info[idx].pqxx_parser;
std::vector<std::vector<Field>> dimensions(expected_dimensions + 1);
while (parsed.first != pqxx::array_parser::juncture::done)
{
if ((parsed.first == pqxx::array_parser::juncture::row_start) && (++dimension > expected_dimensions))
throw Exception("Got more dimensions than expected", ErrorCodes::BAD_ARGUMENTS);
else if (parsed.first == pqxx::array_parser::juncture::string_value)
dimensions[dimension].emplace_back(parse_value(parsed.second));
else if (parsed.first == pqxx::array_parser::juncture::null_value)
dimensions[dimension].emplace_back(array_info[idx].default_value);
else if (parsed.first == pqxx::array_parser::juncture::row_end)
{
max_dimension = std::max(max_dimension, dimension);
if (--dimension == 0)
break;
dimensions[dimension].emplace_back(Array(dimensions[dimension + 1].begin(), dimensions[dimension + 1].end()));
dimensions[dimension + 1].clear();
}
parsed = parser.get_next();
}
if (max_dimension < expected_dimensions)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Got less dimensions than expected. ({} instead of {})", max_dimension, expected_dimensions);
assert_cast<ColumnArray &>(column).insert(Array(dimensions[1].begin(), dimensions[1].end()));
break;
}
}
}
void PostgreSQLBlockInputStream::prepareArrayInfo(size_t column_idx, const DataTypePtr data_type)
{
const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get());
auto nested = array_type->getNestedType();
size_t count_dimensions = 1;
while (isArray(nested))
{
++count_dimensions;
nested = typeid_cast<const DataTypeArray *>(nested.get())->getNestedType();
}
Field default_value = nested->getDefault();
if (nested->isNullable())
nested = static_cast<const DataTypeNullable *>(nested.get())->getNestedType();
WhichDataType which(nested);
std::function<Field(std::string & fields)> parser;
if (which.isUInt8() || which.isUInt16())
parser = [](std::string & field) -> Field { return pqxx::from_string<uint16_t>(field); };
else if (which.isInt8() || which.isInt16())
parser = [](std::string & field) -> Field { return pqxx::from_string<int16_t>(field); };
else if (which.isUInt32())
parser = [](std::string & field) -> Field { return pqxx::from_string<uint32_t>(field); };
else if (which.isInt32())
parser = [](std::string & field) -> Field { return pqxx::from_string<int32_t>(field); };
else if (which.isUInt64())
parser = [](std::string & field) -> Field { return pqxx::from_string<uint64_t>(field); };
else if (which.isInt64())
parser = [](std::string & field) -> Field { return pqxx::from_string<int64_t>(field); };
else if (which.isFloat32())
parser = [](std::string & field) -> Field { return pqxx::from_string<float>(field); };
else if (which.isFloat64())
parser = [](std::string & field) -> Field { return pqxx::from_string<double>(field); };
else if (which.isString() || which.isFixedString())
parser = [](std::string & field) -> Field { return field; };
else if (which.isDate())
parser = [](std::string & field) -> Field { return UInt16{LocalDate{field}.getDayNum()}; };
else if (which.isDateTime())
parser = [](std::string & field) -> Field { return time_t{LocalDateTime{field}}; };
else if (which.isDecimal32())
parser = [nested](std::string & field) -> Field
{
const auto & type = typeid_cast<const DataTypeDecimal<Decimal32> *>(nested.get());
DataTypeDecimal<Decimal32> res(getDecimalPrecision(*type), getDecimalScale(*type));
return convertFieldToType(field, res);
};
else if (which.isDecimal64())
parser = [nested](std::string & field) -> Field
{
const auto & type = typeid_cast<const DataTypeDecimal<Decimal64> *>(nested.get());
DataTypeDecimal<Decimal64> res(getDecimalPrecision(*type), getDecimalScale(*type));
return convertFieldToType(field, res);
};
else if (which.isDecimal128())
parser = [nested](std::string & field) -> Field
{
const auto & type = typeid_cast<const DataTypeDecimal<Decimal128> *>(nested.get());
DataTypeDecimal<Decimal128> res(getDecimalPrecision(*type), getDecimalScale(*type));
return convertFieldToType(field, res);
};
else if (which.isDecimal256())
parser = [nested](std::string & field) -> Field
{
const auto & type = typeid_cast<const DataTypeDecimal<Decimal256> *>(nested.get());
DataTypeDecimal<Decimal256> res(getDecimalPrecision(*type), getDecimalScale(*type));
return convertFieldToType(field, res);
};
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type conversion to {} is not supported", nested->getName());
array_info[column_idx] = {count_dimensions, default_value, parser};
}
}
#endif


@ -0,0 +1,65 @@
#pragma once
#if !defined(ARCADIA_BUILD)
#include "config_core.h"
#endif
#if USE_LIBPQXX
#include <Core/Block.h>
#include <DataStreams/IBlockInputStream.h>
#include <Core/ExternalResultDescription.h>
#include <Core/Field.h>
#include <pqxx/pqxx>
namespace DB
{
using ConnectionPtr = std::shared_ptr<pqxx::connection>;
class PostgreSQLBlockInputStream : public IBlockInputStream
{
public:
PostgreSQLBlockInputStream(
ConnectionPtr connection_,
const std::string & query_str,
const Block & sample_block,
const UInt64 max_block_size_);
String getName() const override { return "PostgreSQL"; }
Block getHeader() const override { return description.sample_block.cloneEmpty(); }
private:
using ValueType = ExternalResultDescription::ValueType;
void readPrefix() override;
Block readImpl() override;
void readSuffix() override;
void insertValue(IColumn & column, std::string_view value,
const ExternalResultDescription::ValueType type, const DataTypePtr data_type, size_t idx);
void insertDefaultValue(IColumn & column, const IColumn & sample_column)
{
column.insertFrom(sample_column, 0);
}
void prepareArrayInfo(size_t column_idx, const DataTypePtr data_type);
String query_str;
const UInt64 max_block_size;
ExternalResultDescription description;
ConnectionPtr connection;
std::unique_ptr<pqxx::read_transaction> tx;
std::unique_ptr<pqxx::stream_from> stream;
struct ArrayInfo
{
size_t num_dimensions;
Field default_value;
std::function<Field(std::string & field)> pqxx_parser;
};
std::unordered_map<size_t, ArrayInfo> array_info;
};
}
#endif
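A hedged sketch of how this stream is meant to be driven; the connection string and table are placeholders, `sample_block` and `process` are assumed to exist, and readPrefix/read/readSuffix come from IBlockInputStream:

```cpp
auto connection = std::make_shared<pqxx::connection>("host=localhost dbname=postgres");  // placeholder DSN
// sample_block: a Block describing the selected columns (assumed to be prepared elsewhere)

PostgreSQLBlockInputStream input(connection, "SELECT id, name FROM table1", sample_block, /* max_block_size */ 8192);

input.readPrefix();                 // opens a pqxx::read_transaction and a COPY-based stream_from
while (Block block = input.read())  // read() keeps calling readImpl() until the stream is drained
    process(block);                 // hypothetical consumer
input.readSuffix();                 // completes the stream and commits the transaction
```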


@ -12,7 +12,7 @@ NO_COMPILER_WARNINGS()
SRCS(
<? find . -name '*.cpp' | grep -v -F tests | sed 's/^\.\// /' | sort ?>
<? find . -name '*.cpp' | grep -v -P 'tests|PostgreSQL' | sed 's/^\.\// /' | sort ?>
)
END()


@ -10,12 +10,15 @@
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeOneElementTuple.h>
#include <Parsers/IAST.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Core/NamesAndTypes.h>
namespace DB
{
@ -145,10 +148,57 @@ namespace
offset_values.resize(i);
}
ColumnPtr arrayOffsetsToSizes(const IColumn & column)
{
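/// Converts cumulative array offsets into per-row sizes, e.g. offsets {2, 5, 6} become sizes {2, 3, 1}.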
const auto & column_offsets = assert_cast<const ColumnArray::ColumnOffsets &>(column);
MutableColumnPtr column_sizes = column_offsets.cloneEmpty();
if (column_offsets.empty())
return column_sizes;
const auto & offsets_data = column_offsets.getData();
auto & sizes_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_sizes).getData();
sizes_data.resize(offsets_data.size());
IColumn::Offset prev_offset = 0;
for (size_t i = 0, size = offsets_data.size(); i < size; ++i)
{
auto current_offset = offsets_data[i];
sizes_data[i] = current_offset - prev_offset;
prev_offset = current_offset;
}
return column_sizes;
}
ColumnPtr arraySizesToOffsets(const IColumn & column)
{
const auto & column_sizes = assert_cast<const ColumnArray::ColumnOffsets &>(column);
MutableColumnPtr column_offsets = column_sizes.cloneEmpty();
if (column_sizes.empty())
return column_offsets;
const auto & sizes_data = column_sizes.getData();
auto & offsets_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_offsets).getData();
offsets_data.resize(sizes_data.size());
IColumn::Offset prev_offset = 0;
for (size_t i = 0, size = sizes_data.size(); i < size; ++i)
{
prev_offset += sizes_data[i];
offsets_data[i] = prev_offset;
}
return column_offsets;
}
}
void DataTypeArray::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
void DataTypeArray::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::ArraySizes);
callback(path, *this);
@ -158,7 +208,7 @@ void DataTypeArray::enumerateStreams(const StreamCallback & callback, SubstreamP
}
void DataTypeArray::serializeBinaryBulkStatePrefix(
void DataTypeArray::serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -168,7 +218,7 @@ void DataTypeArray::serializeBinaryBulkStatePrefix(
}
void DataTypeArray::serializeBinaryBulkStateSuffix(
void DataTypeArray::serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -178,7 +228,7 @@ void DataTypeArray::serializeBinaryBulkStateSuffix(
}
void DataTypeArray::deserializeBinaryBulkStatePrefix(
void DataTypeArray::deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
@ -188,7 +238,7 @@ void DataTypeArray::deserializeBinaryBulkStatePrefix(
}
void DataTypeArray::serializeBinaryBulkWithMultipleStreams(
void DataTypeArray::serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
@ -235,44 +285,52 @@ void DataTypeArray::serializeBinaryBulkWithMultipleStreams(
}
void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
void DataTypeArray::deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
ColumnArray & column_array = typeid_cast<ColumnArray &>(column);
settings.path.push_back(Substream::ArraySizes);
if (auto * stream = settings.getter(settings.path))
if (auto cached_column = getFromSubstreamsCache(cache, settings.path))
{
column_array.getOffsetsPtr() = arraySizesToOffsets(*cached_column);
}
else if (auto * stream = settings.getter(settings.path))
{
if (settings.position_independent_encoding)
deserializeArraySizesPositionIndependent(column, *stream, limit);
else
DataTypeNumber<ColumnArray::Offset>().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0);
addToSubstreamsCache(cache, settings.path, arrayOffsetsToSizes(column_array.getOffsetsColumn()));
}
settings.path.back() = Substream::ArrayElements;
ColumnArray::Offsets & offset_values = column_array.getOffsets();
IColumn & nested_column = column_array.getData();
ColumnPtr & nested_column = column_array.getDataPtr();
/// Number of values corresponding with `offset_values` must be read.
size_t last_offset = offset_values.back();
if (last_offset < nested_column.size())
if (last_offset < nested_column->size())
throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR);
size_t nested_limit = last_offset - nested_column.size();
size_t nested_limit = last_offset - nested_column->size();
/// Adjust value size hint. Divide it to the average array size.
settings.avg_value_size_hint = nested_limit ? settings.avg_value_size_hint / nested_limit * offset_values.size() : 0;
nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state);
nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state, cache);
settings.path.pop_back();
/// Check consistency between offsets and elements subcolumns.
/// But if the elements column is empty, it's ok for columns of Nested types that were added by ALTER.
if (!nested_column.empty() && nested_column.size() != last_offset)
throw ParsingException("Cannot read all array values: read just " + toString(nested_column.size()) + " of " + toString(last_offset),
if (!nested_column->empty() && nested_column->size() != last_offset)
throw ParsingException("Cannot read all array values: read just " + toString(nested_column->size()) + " of " + toString(last_offset),
ErrorCodes::CANNOT_READ_ALL_DATA);
}
@ -530,6 +588,44 @@ bool DataTypeArray::equals(const IDataType & rhs) const
return typeid(rhs) == typeid(*this) && nested->equals(*static_cast<const DataTypeArray &>(rhs).nested);
}
DataTypePtr DataTypeArray::tryGetSubcolumnType(const String & subcolumn_name) const
{
return tryGetSubcolumnTypeImpl(subcolumn_name, 0);
}
DataTypePtr DataTypeArray::tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const
{
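/// "sizeN" names the sizes subcolumn of the N-th nesting level, e.g. "size0" for a plain array.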
if (subcolumn_name == "size" + std::to_string(level))
return createOneElementTuple(std::make_shared<DataTypeUInt64>(), subcolumn_name, false);
DataTypePtr subcolumn;
if (const auto * nested_array = typeid_cast<const DataTypeArray *>(nested.get()))
subcolumn = nested_array->tryGetSubcolumnTypeImpl(subcolumn_name, level + 1);
else
subcolumn = nested->tryGetSubcolumnType(subcolumn_name);
return (subcolumn ? std::make_shared<DataTypeArray>(std::move(subcolumn)) : subcolumn);
}
ColumnPtr DataTypeArray::getSubcolumn(const String & subcolumn_name, const IColumn & column) const
{
return getSubcolumnImpl(subcolumn_name, column, 0);
}
ColumnPtr DataTypeArray::getSubcolumnImpl(const String & subcolumn_name, const IColumn & column, size_t level) const
{
const auto & column_array = assert_cast<const ColumnArray &>(column);
if (subcolumn_name == "size" + std::to_string(level))
return arrayOffsetsToSizes(column_array.getOffsetsColumn());
ColumnPtr subcolumn;
if (const auto * nested_array = typeid_cast<const DataTypeArray *>(nested.get()))
subcolumn = nested_array->getSubcolumnImpl(subcolumn_name, column_array.getData(), level + 1);
else
subcolumn = nested->getSubcolumn(subcolumn_name, column_array.getData());
return ColumnArray::create(subcolumn, column_array.getOffsetsPtr());
}
size_t DataTypeArray::getNumberOfDimensions() const
{


@ -57,32 +57,33 @@ public:
* This is necessary, because when implementing nested structures, several arrays can have common sizes.
*/
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
void serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
void serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
void deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
void serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
void deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeProtobuf(const IColumn & column,
size_t row_num,
@ -111,10 +112,17 @@ public:
return nested->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion();
}
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override;
const DataTypePtr & getNestedType() const { return nested; }
/// 1 for plain array, 2 for array of arrays and so on.
size_t getNumberOfDimensions() const;
private:
ColumnPtr getSubcolumnImpl(const String & subcolumn_name, const IColumn & column, size_t level) const;
DataTypePtr tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const;
};
}


@ -3,6 +3,7 @@
#include <memory>
#include <cstddef>
#include <Core/Types.h>
#include <DataTypes/IDataType.h>
namespace DB
{
@ -62,8 +63,51 @@ public:
virtual void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const = 0;
};
/** Allows customizing an existing data type by giving it custom substreams.
* A customized data type will be serialized/deserialized to files with different names than the base type,
* but its binary and text representation will be unchanged.
* E.g. it can be used for reading single subcolumns of complex types.
*/
class IDataTypeCustomStreams
{
public:
virtual ~IDataTypeCustomStreams() = default;
virtual void enumerateStreams(
const IDataType::StreamCallback & callback,
IDataType::SubstreamPath & path) const = 0;
virtual void serializeBinaryBulkStatePrefix(
IDataType::SerializeBinaryBulkSettings & settings,
IDataType::SerializeBinaryBulkStatePtr & state) const = 0;
virtual void serializeBinaryBulkStateSuffix(
IDataType::SerializeBinaryBulkSettings & settings,
IDataType::SerializeBinaryBulkStatePtr & state) const = 0;
virtual void deserializeBinaryBulkStatePrefix(
IDataType::DeserializeBinaryBulkSettings & settings,
IDataType::DeserializeBinaryBulkStatePtr & state) const = 0;
virtual void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
IDataType::SerializeBinaryBulkSettings & settings,
IDataType::SerializeBinaryBulkStatePtr & state) const = 0;
virtual void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
IDataType::DeserializeBinaryBulkSettings & settings,
IDataType::DeserializeBinaryBulkStatePtr & state,
IDataType::SubstreamsCache * cache) const = 0;
};
using DataTypeCustomNamePtr = std::unique_ptr<const IDataTypeCustomName>;
using DataTypeCustomTextSerializationPtr = std::unique_ptr<const IDataTypeCustomTextSerialization>;
using DataTypeCustomStreamsPtr = std::unique_ptr<const IDataTypeCustomStreams>;
/** Describe a data type customization
*/
@ -71,9 +115,15 @@ struct DataTypeCustomDesc
{
DataTypeCustomNamePtr name;
DataTypeCustomTextSerializationPtr text_serialization;
DataTypeCustomStreamsPtr streams;
DataTypeCustomDesc(DataTypeCustomNamePtr name_, DataTypeCustomTextSerializationPtr text_serialization_)
: name(std::move(name_)), text_serialization(std::move(text_serialization_)) {}
DataTypeCustomDesc(
DataTypeCustomNamePtr name_,
DataTypeCustomTextSerializationPtr text_serialization_ = nullptr,
DataTypeCustomStreamsPtr streams_ = nullptr)
: name(std::move(name_))
, text_serialization(std::move(text_serialization_))
, streams(std::move(streams_)) {}
};
using DataTypeCustomDescPtr = std::unique_ptr<DataTypeCustomDesc>;


@ -0,0 +1,18 @@
#pragma once
#include <memory>
namespace DB
{
class IDataTypeCustomName;
class IDataTypeCustomTextSerialization;
class IDataTypeCustomStreams;
struct DataTypeCustomDesc;
using DataTypeCustomNamePtr = std::unique_ptr<const IDataTypeCustomName>;
using DataTypeCustomTextSerializationPtr = std::unique_ptr<const IDataTypeCustomTextSerialization>;
using DataTypeCustomStreamsPtr = std::unique_ptr<const IDataTypeCustomStreams>;
using DataTypeCustomDescPtr = std::unique_ptr<DataTypeCustomDesc>;
}

View File

@ -79,6 +79,16 @@ DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr
return findCreatorByName(family_name)(parameters);
}
DataTypePtr DataTypeFactory::getCustom(DataTypeCustomDescPtr customization) const
{
if (!customization->name)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create custom type without name");
auto type = get(customization->name->getName());
type->setCustomization(std::move(customization));
return type;
}
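A minimal sketch (not part of this change; intended usage is an assumption) of how a caller builds a customization and resolves it through getCustom(). DataTypeCustomFixedName comes from DataTypeCustom.h; the base type is looked up by the customization's name:

#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeCustom.h>

using namespace DB;

/// Sketch only: a no-op customization that keeps the printed name "UInt8".
/// getCustom() resolves the base type from the name and attaches the description to it.
DataTypePtr makeCustomizedUInt8()
{
    auto desc = std::make_unique<DataTypeCustomDesc>(std::make_unique<DataTypeCustomFixedName>("UInt8"));
    return DataTypeFactory::instance().getCustom(std::move(desc));
}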
void DataTypeFactory::registerDataType(const String & family_name, Value creator, CaseSensitiveness case_sensitiveness)
{

View File

@ -3,6 +3,7 @@
#include <DataTypes/IDataType.h>
#include <Parsers/IAST_fwd.h>
#include <Common/IFactoryWithAliases.h>
#include <DataTypes/DataTypeCustom_fwd.h>
#include <functional>
@ -33,6 +34,7 @@ public:
DataTypePtr get(const String & full_name) const;
DataTypePtr get(const String & family_name, const ASTPtr & parameters) const;
DataTypePtr get(const ASTPtr & ast) const;
DataTypePtr getCustom(DataTypeCustomDescPtr customization) const;
/// Register a type family by its name.
void registerDataType(const String & family_name, Value creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
@ -84,5 +86,6 @@ void registerDataTypeLowCardinality(DataTypeFactory & factory);
void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory);
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
void registerDataTypeDomainGeo(DataTypeFactory & factory);
void registerDataTypeOneElementTuple(DataTypeFactory & factory);
}

View File

@ -50,7 +50,7 @@ DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_)
+ dictionary_type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
void DataTypeLowCardinality::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
void DataTypeLowCardinality::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::DictionaryKeys);
dictionary_type->enumerateStreams(callback, path);
@ -243,7 +243,7 @@ static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeStat
return low_cardinality_state;
}
void DataTypeLowCardinality::serializeBinaryBulkStatePrefix(
void DataTypeLowCardinality::serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -263,7 +263,7 @@ void DataTypeLowCardinality::serializeBinaryBulkStatePrefix(
state = std::make_shared<SerializeStateLowCardinality>(key_version);
}
void DataTypeLowCardinality::serializeBinaryBulkStateSuffix(
void DataTypeLowCardinality::serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -289,7 +289,7 @@ void DataTypeLowCardinality::serializeBinaryBulkStateSuffix(
}
}
void DataTypeLowCardinality::deserializeBinaryBulkStatePrefix(
void DataTypeLowCardinality::deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
@ -482,7 +482,7 @@ namespace
}
}
void DataTypeLowCardinality::serializeBinaryBulkWithMultipleStreams(
void DataTypeLowCardinality::serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
@ -579,11 +579,12 @@ void DataTypeLowCardinality::serializeBinaryBulkWithMultipleStreams(
index_version.getDataType()->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows);
}
void DataTypeLowCardinality::deserializeBinaryBulkWithMultipleStreams(
void DataTypeLowCardinality::deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * /* cache */) const
{
ColumnLowCardinality & low_cardinality_column = typeid_cast<ColumnLowCardinality &>(column);

View File

@ -22,32 +22,33 @@ public:
const char * getFamilyName() const override { return "LowCardinality"; }
TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; }
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
void serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
void serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
void deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
void serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
void deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
void deserializeBinary(Field & field, ReadBuffer & istr) const override;

View File

@ -278,34 +278,34 @@ void DataTypeMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const
}
void DataTypeMap::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
void DataTypeMap::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
{
nested->enumerateStreams(callback, path);
}
void DataTypeMap::serializeBinaryBulkStatePrefix(
void DataTypeMap::serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
nested->serializeBinaryBulkStatePrefix(settings, state);
}
void DataTypeMap::serializeBinaryBulkStateSuffix(
void DataTypeMap::serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
nested->serializeBinaryBulkStateSuffix(settings, state);
}
void DataTypeMap::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
void DataTypeMap::deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
nested->deserializeBinaryBulkStatePrefix(settings, state);
}
void DataTypeMap::serializeBinaryBulkWithMultipleStreams(
void DataTypeMap::serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
@ -315,13 +315,15 @@ void DataTypeMap::serializeBinaryBulkWithMultipleStreams(
nested->serializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), offset, limit, settings, state);
}
void DataTypeMap::deserializeBinaryBulkWithMultipleStreams(
void DataTypeMap::deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
nested->deserializeBinaryBulkWithMultipleStreams(extractNestedColumn(column), limit, settings, state);
auto & column_map = assert_cast<ColumnMap &>(column);
nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache);
}
void DataTypeMap::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const

View File

@ -46,34 +46,33 @@ public:
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
/** Each sub-column in a map is serialized in a separate stream.
*/
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
void serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
void serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
void deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
void serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
void deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override;
void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override;

View File

@ -0,0 +1,76 @@
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Common/quoteString.h>
#include <Parsers/ASTNameTypePair.h>
namespace DB
{
namespace ErrorCodes
{
extern const int EMPTY_DATA_PASSED;
extern const int BAD_ARGUMENTS;
}
String DataTypeNestedCustomName::getName() const
{
WriteBufferFromOwnString s;
s << "Nested(";
for (size_t i = 0; i < elems.size(); ++i)
{
if (i != 0)
s << ", ";
s << backQuoteIfNeed(names[i]) << ' ';
s << elems[i]->getName();
}
s << ")";
return s.str();
}
static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.empty())
throw Exception("Nested cannot be empty", ErrorCodes::EMPTY_DATA_PASSED);
DataTypes nested_types;
Strings nested_names;
nested_types.reserve(arguments->children.size());
nested_names.reserve(arguments->children.size());
for (const auto & child : arguments->children)
{
const auto * name_type = child->as<ASTNameTypePair>();
if (!name_type)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Data type Nested accepts only pairs with name and type");
auto nested_type = DataTypeFactory::instance().get(name_type->type);
nested_types.push_back(std::move(nested_type));
nested_names.push_back(name_type->name);
}
auto data_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(nested_types, nested_names));
auto custom_name = std::make_unique<DataTypeNestedCustomName>(nested_types, nested_names);
return std::make_pair(std::move(data_type), std::make_unique<DataTypeCustomDesc>(std::move(custom_name), nullptr));
}
void registerDataTypeNested(DataTypeFactory & factory)
{
return factory.registerDataTypeCustom("Nested", create);
}
DataTypePtr createNested(const DataTypes & types, const Names & names)
{
auto custom_desc = std::make_unique<DataTypeCustomDesc>(
std::make_unique<DataTypeNestedCustomName>(types, names));
return DataTypeFactory::instance().getCustom(std::move(custom_desc));
}
}

View File

@ -0,0 +1,34 @@
#pragma once
#include <DataTypes/DataTypeWithSimpleSerialization.h>
#include <DataTypes/DataTypeCustom.h>
namespace DB
{
class DataTypeNestedCustomName final : public IDataTypeCustomName
{
private:
DataTypes elems;
Strings names;
public:
DataTypeNestedCustomName(const DataTypes & elems_, const Strings & names_)
: elems(elems_), names(names_)
{
}
String getName() const override;
};
DataTypePtr createNested(const DataTypes & types, const Names & names);
template <typename DataType>
inline bool isNested(const DataType & data_type)
{
return typeid_cast<const DataTypeNestedCustomName *>(data_type->getCustomName()) != nullptr;
}
}

View File

@ -2,6 +2,7 @@
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeOneElementTuple.h>
#include <Columns/ColumnNullable.h>
#include <Core/Field.h>
#include <IO/ReadBuffer.h>
@ -41,7 +42,7 @@ bool DataTypeNullable::onlyNull() const
}
void DataTypeNullable::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
void DataTypeNullable::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::NullMap);
callback(path, *this);
@ -51,7 +52,7 @@ void DataTypeNullable::enumerateStreams(const StreamCallback & callback, Substre
}
void DataTypeNullable::serializeBinaryBulkStatePrefix(
void DataTypeNullable::serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -61,7 +62,7 @@ void DataTypeNullable::serializeBinaryBulkStatePrefix(
}
void DataTypeNullable::serializeBinaryBulkStateSuffix(
void DataTypeNullable::serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -71,7 +72,7 @@ void DataTypeNullable::serializeBinaryBulkStateSuffix(
}
void DataTypeNullable::deserializeBinaryBulkStatePrefix(
void DataTypeNullable::deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
@ -81,7 +82,7 @@ void DataTypeNullable::deserializeBinaryBulkStatePrefix(
}
void DataTypeNullable::serializeBinaryBulkWithMultipleStreams(
void DataTypeNullable::serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
@ -103,20 +104,28 @@ void DataTypeNullable::serializeBinaryBulkWithMultipleStreams(
}
void DataTypeNullable::deserializeBinaryBulkWithMultipleStreams(
void DataTypeNullable::deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
ColumnNullable & col = assert_cast<ColumnNullable &>(column);
settings.path.push_back(Substream::NullMap);
if (auto * stream = settings.getter(settings.path))
if (auto cached_column = getFromSubstreamsCache(cache, settings.path))
{
col.getNullMapColumnPtr() = cached_column;
}
else if (auto * stream = settings.getter(settings.path))
{
DataTypeUInt8().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0);
addToSubstreamsCache(cache, settings.path, col.getNullMapColumnPtr());
}
settings.path.back() = Substream::NullableElements;
nested_data_type->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), limit, settings, state);
nested_data_type->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumnPtr(), limit, settings, state, cache);
settings.path.pop_back();
}
@ -525,6 +534,23 @@ bool DataTypeNullable::equals(const IDataType & rhs) const
return rhs.isNullable() && nested_data_type->equals(*static_cast<const DataTypeNullable &>(rhs).nested_data_type);
}
DataTypePtr DataTypeNullable::tryGetSubcolumnType(const String & subcolumn_name) const
{
if (subcolumn_name == "null")
return createOneElementTuple(std::make_shared<DataTypeUInt8>(), subcolumn_name, false);
return nested_data_type->tryGetSubcolumnType(subcolumn_name);
}
ColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, const IColumn & column) const
{
const auto & column_nullable = assert_cast<const ColumnNullable &>(column);
if (subcolumn_name == "null")
return column_nullable.getNullMapColumnPtr()->assumeMutable();
return nested_data_type->getSubcolumn(subcolumn_name, column_nullable.getNestedColumn());
}
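A short sketch (an assumption, not taken from this change) of how these two methods are meant to be used together for a Nullable(String) column:

#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnNullable.h>

using namespace DB;

/// Sketch only. Reads the "null" subcolumn of a Nullable column without touching the nested data.
ColumnPtr extractNullSubcolumn(const ColumnNullable & column_nullable)
{
    DataTypePtr type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());

    /// tryGetSubcolumnType("null") returns a one-element tuple wrapper around UInt8 (see above),
    /// so the subcolumn shares the same ".null" substream as the parent column.
    DataTypePtr null_type = type->getSubcolumnType("null");

    /// The subcolumn data itself is simply the null map of the Nullable column.
    return type->getSubcolumn("null", column_nullable);
}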
static DataTypePtr create(const ASTPtr & arguments)
{

View File

@ -18,32 +18,33 @@ public:
const char * getFamilyName() const override { return "Nullable"; }
TypeIndex getTypeId() const override { return TypeIndex::Nullable; }
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
void serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
void serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
void deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
void serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
void deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
void deserializeBinary(Field & field, ReadBuffer & istr) const override;
@ -97,6 +98,8 @@ public:
size_t getSizeOfValueInMemory() const override;
bool onlyNull() const override;
bool canBeInsideLowCardinality() const override { return nested_data_type->canBeInsideLowCardinality(); }
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override;
const DataTypePtr & getNestedType() const { return nested_data_type; }

View File

@ -0,0 +1,112 @@
#include <DataTypes/DataTypeOneElementTuple.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeCustom.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Common/quoteString.h>
#include <Parsers/ASTNameTypePair.h>
#include <Columns/IColumn.h>
namespace DB
{
namespace
{
/** Custom substream representation for a single subcolumn.
* It serializes/deserializes the column as the nested type, but as if it were
* a named tuple with one element and the given name.
*/
class DataTypeOneElementTupleStreams : public IDataTypeCustomStreams
{
private:
DataTypePtr nested;
String name;
bool escape_delimiter;
public:
DataTypeOneElementTupleStreams(const DataTypePtr & nested_, const String & name_, bool escape_delimiter_)
: nested(nested_), name(name_), escape_delimiter(escape_delimiter_) {}
void enumerateStreams(
const IDataType::StreamCallback & callback,
IDataType::SubstreamPath & path) const override
{
addToPath(path);
nested->enumerateStreams(callback, path);
path.pop_back();
}
void serializeBinaryBulkStatePrefix(
IDataType::SerializeBinaryBulkSettings & settings,
IDataType::SerializeBinaryBulkStatePtr & state) const override
{
addToPath(settings.path);
nested->serializeBinaryBulkStatePrefix(settings, state);
settings.path.pop_back();
}
void serializeBinaryBulkStateSuffix(
IDataType::SerializeBinaryBulkSettings & settings,
IDataType::SerializeBinaryBulkStatePtr & state) const override
{
addToPath(settings.path);
nested->serializeBinaryBulkStateSuffix(settings, state);
settings.path.pop_back();
}
void deserializeBinaryBulkStatePrefix(
IDataType::DeserializeBinaryBulkSettings & settings,
IDataType::DeserializeBinaryBulkStatePtr & state) const override
{
addToPath(settings.path);
nested->deserializeBinaryBulkStatePrefix(settings, state);
settings.path.pop_back();
}
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
IDataType::SerializeBinaryBulkSettings & settings,
IDataType::SerializeBinaryBulkStatePtr & state) const override
{
addToPath(settings.path);
nested->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state);
settings.path.pop_back();
}
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
IDataType::DeserializeBinaryBulkSettings & settings,
IDataType::DeserializeBinaryBulkStatePtr & state,
IDataType::SubstreamsCache * cache) const override
{
addToPath(settings.path);
nested->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache);
settings.path.pop_back();
}
private:
void addToPath(IDataType::SubstreamPath & path) const
{
path.push_back(IDataType::Substream::TupleElement);
path.back().tuple_element_name = name;
path.back().escape_tuple_delimiter = escape_delimiter;
}
};
}
DataTypePtr createOneElementTuple(const DataTypePtr & type, const String & name, bool escape_delimiter)
{
auto custom_desc = std::make_unique<DataTypeCustomDesc>(
std::make_unique<DataTypeCustomFixedName>(type->getName()), nullptr,
std::make_unique<DataTypeOneElementTupleStreams>(type, name, escape_delimiter));
return DataTypeFactory::instance().getCustom(std::move(custom_desc));
}
}
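For context, a tiny sketch (an assumption mirroring DataTypeNullable::tryGetSubcolumnType above) of what this factory function is used for:

#include <DataTypes/DataTypeOneElementTuple.h>
#include <DataTypes/DataTypesNumber.h>

using namespace DB;

/// Sketch only: the type of the "null" subcolumn of a Nullable column. It prints as "UInt8"
/// (fixed custom name), but its streams go through a TupleElement("null") path with escaping
/// disabled, so it reads the same ".null" file as the parent column's null map.
DataTypePtr makeNullSubcolumnType()
{
    return createOneElementTuple(std::make_shared<DataTypeUInt8>(), "null", /* escape_delimiter = */ false);
}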

View File

@ -0,0 +1,10 @@
#pragma once
#include <DataTypes/IDataType.h>
namespace DB
{
DataTypePtr createOneElementTuple(const DataTypePtr & type, const String & name, bool escape_delimiter = true);
}

View File

@ -5,6 +5,7 @@
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeOneElementTuple.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTNameTypePair.h>
#include <Common/typeid_cast.h>
@ -30,6 +31,7 @@ namespace ErrorCodes
extern const int EMPTY_DATA_PASSED;
extern const int LOGICAL_ERROR;
extern const int NOT_FOUND_COLUMN_IN_BLOCK;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
}
@ -357,7 +359,7 @@ void DataTypeTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, cons
});
}
void DataTypeTuple::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
void DataTypeTuple::enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::TupleElement);
for (const auto i : ext::range(0, ext::size(elems)))
@ -412,7 +414,7 @@ static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(IDataT
return tuple_state;
}
void DataTypeTuple::serializeBinaryBulkStatePrefix(
void DataTypeTuple::serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -430,7 +432,7 @@ void DataTypeTuple::serializeBinaryBulkStatePrefix(
state = std::move(tuple_state);
}
void DataTypeTuple::serializeBinaryBulkStateSuffix(
void DataTypeTuple::serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
@ -445,7 +447,7 @@ void DataTypeTuple::serializeBinaryBulkStateSuffix(
settings.path.pop_back();
}
void DataTypeTuple::deserializeBinaryBulkStatePrefix(
void DataTypeTuple::deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
@ -463,7 +465,7 @@ void DataTypeTuple::deserializeBinaryBulkStatePrefix(
state = std::move(tuple_state);
}
void DataTypeTuple::serializeBinaryBulkWithMultipleStreams(
void DataTypeTuple::serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
@ -482,21 +484,22 @@ void DataTypeTuple::serializeBinaryBulkWithMultipleStreams(
settings.path.pop_back();
}
void DataTypeTuple::deserializeBinaryBulkWithMultipleStreams(
void DataTypeTuple::deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
auto * tuple_state = checkAndGetTupleDeserializeState(state);
auto & column_tuple = assert_cast<ColumnTuple &>(column);
settings.path.push_back(Substream::TupleElement);
settings.avg_value_size_hint = 0;
for (const auto i : ext::range(0, ext::size(elems)))
{
settings.path.back().tuple_element_name = names[i];
auto & element_col = extractElementColumn(column, i);
elems[i]->deserializeBinaryBulkWithMultipleStreams(element_col, limit, settings, tuple_state->states[i]);
elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache);
}
settings.path.pop_back();
}
@ -611,6 +614,47 @@ size_t DataTypeTuple::getSizeOfValueInMemory() const
return res;
}
DataTypePtr DataTypeTuple::tryGetSubcolumnType(const String & subcolumn_name) const
{
for (size_t i = 0; i < names.size(); ++i)
{
if (startsWith(subcolumn_name, names[i]))
{
size_t name_length = names[i].size();
DataTypePtr subcolumn_type;
if (subcolumn_name.size() == name_length)
subcolumn_type = elems[i];
else if (subcolumn_name[name_length] == '.')
subcolumn_type = elems[i]->tryGetSubcolumnType(subcolumn_name.substr(name_length + 1));
if (subcolumn_type)
return createOneElementTuple(std::move(subcolumn_type), names[i]);
}
}
return nullptr;
}
ColumnPtr DataTypeTuple::getSubcolumn(const String & subcolumn_name, const IColumn & column) const
{
for (size_t i = 0; i < names.size(); ++i)
{
if (startsWith(subcolumn_name, names[i]))
{
size_t name_length = names[i].size();
const auto & subcolumn = extractElementColumn(column, i);
if (subcolumn_name.size() == name_length)
return subcolumn.assumeMutable();
if (subcolumn_name[name_length] == '.')
return elems[i]->getSubcolumn(subcolumn_name.substr(name_length + 1), subcolumn);
}
}
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
}
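A sketch (assumptions only, not part of this change) of how the prefix matching above resolves plain and nested subcolumn names:

#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>

using namespace DB;

/// Sketch only, for a column t of type Tuple(a UInt32, b Tuple(c String)).
void resolveTupleSubcolumns()
{
    auto inner = std::make_shared<DataTypeTuple>(DataTypes{std::make_shared<DataTypeString>()}, Strings{"c"});
    auto type = std::make_shared<DataTypeTuple>(
        DataTypes{std::make_shared<DataTypeUInt32>(), inner}, Strings{"a", "b"});

    /// "a" matches the first element by name and is wrapped into a one-element tuple named "a".
    DataTypePtr a = type->tryGetSubcolumnType("a");

    /// "b.c": "b" matches the second element, then "c" is resolved recursively inside it.
    DataTypePtr bc = type->tryGetSubcolumnType("b.c");

    /// Unknown names return nullptr here; getSubcolumnType() would throw ILLEGAL_COLUMN instead.
    DataTypePtr missing = type->tryGetSubcolumnType("x");
}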
static DataTypePtr create(const ASTPtr & arguments)
{
@ -648,13 +692,4 @@ void registerDataTypeTuple(DataTypeFactory & factory)
factory.registerDataType("Tuple", create);
}
void registerDataTypeNested(DataTypeFactory & factory)
{
/// Nested(...) data type is just a sugar for Array(Tuple(...))
factory.registerDataType("Nested", [&factory](const ASTPtr & arguments)
{
return std::make_shared<DataTypeArray>(factory.get("Tuple", arguments));
});
}
}

View File

@ -53,32 +53,33 @@ public:
/** Each sub-column in a tuple is serialized in a separate stream.
*/
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
void serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
void serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
void deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
void serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
void deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override;
void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override;
@ -98,6 +99,9 @@ public:
size_t getMaximumSizeOfValueInMemory() const override;
size_t getSizeOfValueInMemory() const override;
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const override;
const DataTypes & getElements() const { return elems; }
const Strings & getElementNames() const { return names; }

View File

@ -3,8 +3,10 @@
#include <Common/Exception.h>
#include <Common/escapeForFileName.h>
#include <Common/SipHash.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeCustom.h>
@ -19,9 +21,48 @@ namespace ErrorCodes
extern const int MULTIPLE_STREAMS_REQUIRED;
extern const int LOGICAL_ERROR;
extern const int DATA_TYPE_CANNOT_BE_PROMOTED;
extern const int ILLEGAL_COLUMN;
}
IDataType::IDataType() : custom_name(nullptr), custom_text_serialization(nullptr)
String IDataType::Substream::toString() const
{
switch (type)
{
case ArrayElements:
return "ArrayElements";
case ArraySizes:
return "ArraySizes";
case NullableElements:
return "NullableElements";
case NullMap:
return "NullMap";
case TupleElement:
return "TupleElement(" + tuple_element_name + ", "
+ std::to_string(escape_tuple_delimiter) + ")";
case DictionaryKeys:
return "DictionaryKeys";
case DictionaryIndexes:
return "DictionaryIndexes";
}
__builtin_unreachable();
}
String IDataType::SubstreamPath::toString() const
{
WriteBufferFromOwnString wb;
wb << "{";
for (size_t i = 0; i < size(); ++i)
{
if (i != 0)
wb << ", ";
wb << at(i).toString();
}
wb << "}";
return wb.str();
}
IDataType::IDataType() : custom_name(nullptr), custom_text_serialization(nullptr), custom_streams(nullptr)
{
}
@ -93,42 +134,89 @@ size_t IDataType::getSizeOfValueInMemory() const
throw Exception("Value of type " + getName() + " in memory is not of fixed size.", ErrorCodes::LOGICAL_ERROR);
}
String IDataType::getFileNameForStream(const String & column_name, const IDataType::SubstreamPath & path)
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
{
/// Sizes of arrays (elements of Nested type) are shared (all reside in single file).
String nested_table_name = Nested::extractTableName(column_name);
auto subcolumn_type = tryGetSubcolumnType(subcolumn_name);
if (subcolumn_type)
return subcolumn_type;
bool is_sizes_of_nested_type =
path.size() == 1 /// Nested structure may have arrays as nested elements (so effectively we have multidimensional arrays).
/// Sizes of arrays are shared only at first level.
&& path[0].type == IDataType::Substream::ArraySizes
&& nested_table_name != column_name;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
}
size_t array_level = 0;
String stream_name = escapeForFileName(is_sizes_of_nested_type ? nested_table_name : column_name);
for (const Substream & elem : path)
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const IColumn &) const
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
}
Names IDataType::getSubcolumnNames() const
{
NameSet res;
enumerateStreams([&res, this](const SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
if (elem.type == Substream::NullMap)
stream_name += ".null";
else if (elem.type == Substream::ArraySizes)
stream_name += ".size" + toString(array_level);
else if (elem.type == Substream::ArrayElements)
++array_level;
else if (elem.type == Substream::TupleElement)
SubstreamPath new_path;
/// Iterate over path to try to get intermediate subcolumns for complex nested types.
for (const auto & elem : substream_path)
{
/// For compatibility reasons, we use %2E instead of dot.
new_path.push_back(elem);
auto subcolumn_name = getSubcolumnNameForStream(new_path);
if (!subcolumn_name.empty() && tryGetSubcolumnType(subcolumn_name))
res.insert(subcolumn_name);
}
});
return Names(std::make_move_iterator(res.begin()), std::make_move_iterator(res.end()));
}
static String getNameForSubstreamPath(
String stream_name,
const IDataType::SubstreamPath & path,
bool escape_tuple_delimiter)
{
size_t array_level = 0;
for (const auto & elem : path)
{
if (elem.type == IDataType::Substream::NullMap)
stream_name += ".null";
else if (elem.type == IDataType::Substream::ArraySizes)
stream_name += ".size" + toString(array_level);
else if (elem.type == IDataType::Substream::ArrayElements)
++array_level;
else if (elem.type == IDataType::Substream::DictionaryKeys)
stream_name += ".dict";
else if (elem.type == IDataType::Substream::TupleElement)
{
/// For compatibility reasons, we use %2E (escaped dot) instead of dot,
/// because nested data may be represented not by an Array of Tuple,
/// but by separate Array columns with names of the form a.b,
/// where the whole name is encoded.
stream_name += "%2E" + escapeForFileName(elem.tuple_element_name);
stream_name += (escape_tuple_delimiter && elem.escape_tuple_delimiter ?
escapeForFileName(".") : ".") + escapeForFileName(elem.tuple_element_name);
}
else if (elem.type == Substream::DictionaryKeys)
stream_name += ".dict";
}
return stream_name;
}
String IDataType::getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path)
{
auto name_in_storage = column.getNameInStorage();
auto nested_storage_name = Nested::extractTableName(name_in_storage);
if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes))
name_in_storage = nested_storage_name;
auto stream_name = escapeForFileName(name_in_storage);
return getNameForSubstreamPath(std::move(stream_name), path, true);
}
String IDataType::getSubcolumnNameForStream(const SubstreamPath & path)
{
auto subcolumn_name = getNameForSubstreamPath("", path, false);
if (!subcolumn_name.empty())
subcolumn_name = subcolumn_name.substr(1); // It starts with a dot.
return subcolumn_name;
}
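To make the mapping concrete, a small sketch; the expected names in the comments are assumptions derived from the code above, not captured output:

#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <Core/NamesAndTypes.h>

using namespace DB;

/// Sketch only.
void streamNamesForNullableColumn()
{
    NameAndTypePair column{"m", std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>())};

    IDataType::SubstreamPath path;
    path.push_back(IDataType::Substream::NullMap);

    /// The null map of column "m" gets its own file; expected name: "m.null".
    String file_name = IDataType::getFileNameForStream(column, path);

    /// The same path identifies the subcolumn; expected name: "null" (the leading dot is stripped).
    String subcolumn_name = IDataType::getSubcolumnNameForStream(path);

    /// Other expected cases (not exercised here): first-level ArraySizes of "n.a" from Nested
    /// table "n" map to the shared file "n.size0"; a TupleElement "x" of column "t" maps to
    /// "t%2Ex" on disk when the delimiter is escaped, and to subcolumn name "x" otherwise.
}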
bool IDataType::isSpecialCompressionAllowed(const SubstreamPath & path)
{
@ -147,6 +235,102 @@ void IDataType::insertDefaultInto(IColumn & column) const
column.insertDefault();
}
void IDataType::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
if (custom_streams)
custom_streams->enumerateStreams(callback, path);
else
enumerateStreamsImpl(callback, path);
}
void IDataType::serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
if (custom_streams)
custom_streams->serializeBinaryBulkStatePrefix(settings, state);
else
serializeBinaryBulkStatePrefixImpl(settings, state);
}
void IDataType::serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
if (custom_streams)
custom_streams->serializeBinaryBulkStateSuffix(settings, state);
else
serializeBinaryBulkStateSuffixImpl(settings, state);
}
void IDataType::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
if (custom_streams)
custom_streams->deserializeBinaryBulkStatePrefix(settings, state);
else
deserializeBinaryBulkStatePrefixImpl(settings, state);
}
void IDataType::serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
if (custom_streams)
custom_streams->serializeBinaryBulkWithMultipleStreams(column, offset, limit, settings, state);
else
serializeBinaryBulkWithMultipleStreamsImpl(column, offset, limit, settings, state);
}
void IDataType::deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & /* state */,
SubstreamsCache * /* cache */) const
{
if (ReadBuffer * stream = settings.getter(settings.path))
deserializeBinaryBulk(column, *stream, limit, settings.avg_value_size_hint);
}
void IDataType::deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
if (custom_streams)
{
custom_streams->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, cache);
return;
}
/// Do not cache complex types, because they can be constructed
/// from their subcolumns, which are already in the cache.
if (!haveSubtypes())
{
auto cached_column = getFromSubstreamsCache(cache, settings.path);
if (cached_column)
{
column = cached_column;
return;
}
}
auto mutable_column = column->assumeMutable();
deserializeBinaryBulkWithMultipleStreamsImpl(*mutable_column, limit, settings, state, cache);
column = std::move(mutable_column);
if (!haveSubtypes())
addToSubstreamsCache(cache, settings.path, column);
}
void IDataType::serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (custom_text_serialization)
@ -243,6 +427,27 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
if (custom_desc_->text_serialization)
custom_text_serialization = std::move(custom_desc_->text_serialization);
if (custom_desc_->streams)
custom_streams = std::move(custom_desc_->streams);
}
void IDataType::addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column)
{
if (cache && !path.empty())
cache->emplace(getSubcolumnNameForStream(path), column);
}
ColumnPtr IDataType::getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path)
{
if (!cache || path.empty())
return nullptr;
auto it = cache->find(getSubcolumnNameForStream(path));
if (it == cache->end())
return nullptr;
return it->second;
}
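A rough sketch (an assumption about intended usage, not part of this change) of how a reader shares the cache between a column and one of its subcolumns, so that common substreams are deserialized only once:

#include <DataTypes/IDataType.h>

using namespace DB;

/// Sketch only.
void readColumnAndSubcolumn(
    const DataTypePtr & type,
    const DataTypePtr & subcolumn_type,
    IDataType::DeserializeBinaryBulkSettings & settings,
    size_t limit)
{
    IDataType::SubstreamsCache cache;

    IDataType::DeserializeBinaryBulkStatePtr state;
    type->deserializeBinaryBulkStatePrefix(settings, state);

    ColumnPtr column = type->createColumn();
    /// Leaf substreams read here (e.g. a null map or array sizes) are put into the cache,
    /// keyed by getSubcolumnNameForStream(path).
    type->deserializeBinaryBulkWithMultipleStreams(column, limit, settings, state, &cache);

    IDataType::DeserializeBinaryBulkStatePtr subcolumn_state;
    subcolumn_type->deserializeBinaryBulkStatePrefix(settings, subcolumn_state);

    ColumnPtr subcolumn = subcolumn_type->createColumn();
    /// If this subcolumn resolves to one of the cached substreams, the cached ColumnPtr is
    /// reused instead of reading the stream again.
    subcolumn_type->deserializeBinaryBulkWithMultipleStreams(subcolumn, limit, settings, subcolumn_state, &cache);
}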
}

View File

@ -3,7 +3,9 @@
#include <memory>
#include <Common/COW.h>
#include <boost/noncopyable.hpp>
#include <DataTypes/DataTypeCustom.h>
#include <Core/Names.h>
#include <Core/Types.h>
#include <DataTypes/DataTypeCustom_fwd.h>
namespace DB
@ -27,6 +29,8 @@ using DataTypes = std::vector<DataTypePtr>;
class ProtobufReader;
class ProtobufWriter;
struct NameAndTypePair;
/** Properties of data type.
* Contains methods for serialization/deserialization.
@ -91,30 +95,42 @@ public:
TupleElement,
MapElement,
DictionaryKeys,
DictionaryIndexes,
};
Type type;
/// Index of tuple element, starting at 1.
/// Index of tuple element, starting at 1, or its name.
String tuple_element_name;
/// Do we need to escape a dot in filenames for tuple elements.
bool escape_tuple_delimiter = true;
Substream(Type type_) : type(type_) {}
String toString() const;
};
using SubstreamPath = std::vector<Substream>;
struct SubstreamPath : public std::vector<Substream>
{
String toString() const;
};
/// Cache for substreams that are shared between a type and its subcolumns,
/// e.g. the sizes of arrays of a Nested data type.
using SubstreamsCache = std::unordered_map<String, ColumnPtr>;
using StreamCallback = std::function<void(const SubstreamPath &, const IDataType &)>;
virtual void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
callback(path, *this);
}
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
virtual DataTypePtr tryGetSubcolumnType(const String & /* subcolumn_name */) const { return nullptr; }
DataTypePtr getSubcolumnType(const String & subcolumn_name) const;
virtual ColumnPtr getSubcolumn(const String & subcolumn_name, const IColumn & column) const;
Names getSubcolumnNames() const;
using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
@ -155,19 +171,19 @@ public:
};
/// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
virtual void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const;
/// Call after serializeBinaryBulkWithMultipleStreams chain to finish serialization.
virtual void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const;
/// Call before the deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr.
virtual void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & /*settings*/,
DeserializeBinaryBulkStatePtr & /*state*/) const {}
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const;
/** 'offset' and 'limit' are used to specify range.
* limit = 0 - means no limit.
@ -175,27 +191,20 @@ public:
* offset + limit could be greater than size of column
* - in that case, column is serialized till the end.
*/
virtual void serializeBinaryBulkWithMultipleStreams(
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & /*state*/) const
{
if (WriteBuffer * stream = settings.getter(settings.path))
serializeBinaryBulk(column, *stream, offset, limit);
}
SerializeBinaryBulkStatePtr & state) const;
/// Read no more than limit values and append them into column.
virtual void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & /*state*/) const
{
if (ReadBuffer * stream = settings.getter(settings.path))
deserializeBinaryBulk(column, *stream, limit, settings.avg_value_size_hint);
}
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache = nullptr) const;
/** Override these methods for data types that require just a single stream (most data types).
*/
@ -268,6 +277,41 @@ public:
protected:
virtual String doGetName() const;
virtual void enumerateStreamsImpl(const StreamCallback & callback, SubstreamPath & path) const
{
callback(path, *this);
}
virtual void serializeBinaryBulkStatePrefixImpl(
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
virtual void serializeBinaryBulkStateSuffixImpl(
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
virtual void deserializeBinaryBulkStatePrefixImpl(
DeserializeBinaryBulkSettings & /*settings*/,
DeserializeBinaryBulkStatePtr & /*state*/) const {}
virtual void serializeBinaryBulkWithMultipleStreamsImpl(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & /*state*/) const
{
if (WriteBuffer * stream = settings.getter(settings.path))
serializeBinaryBulk(column, *stream, offset, limit);
}
virtual void deserializeBinaryBulkWithMultipleStreamsImpl(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const;
/// Default implementations of text serialization in case of 'custom_text_serialization' is not set.
virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0;
@ -286,6 +330,9 @@ protected:
}
public:
static void addToSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path, ColumnPtr column);
static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path);
/** Create empty column for corresponding type.
*/
virtual MutableColumnPtr createColumn() const = 0;
@ -443,7 +490,8 @@ public:
/// Updates avg_value_size_hint for a newly read column. Used to optimize deserialization. Zero is expected for the first column.
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint);
static String getFileNameForStream(const String & column_name, const SubstreamPath & path);
static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path);
static String getSubcolumnNameForStream(const SubstreamPath & path);
/// Substream path supports special compression methods like codec Delta.
/// For all other substreams (like ArraySizes, NullMasks, etc.) we use only
@ -458,9 +506,11 @@ private:
/// This is mutable to allow setting custom name and serialization on `const IDataType` post construction.
mutable DataTypeCustomNamePtr custom_name;
mutable DataTypeCustomTextSerializationPtr custom_text_serialization;
mutable DataTypeCustomStreamsPtr custom_streams;
public:
const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
const IDataTypeCustomStreams * getCustomStreams() const { return custom_streams.get(); }
};

View File

@ -7,6 +7,7 @@
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNested.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
@ -84,7 +85,8 @@ Block flatten(const Block & block)
for (const auto & elem : block)
{
if (const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(elem.type.get()))
const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(elem.type.get());
if (type_arr)
{
const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(type_arr->getNestedType().get());
if (type_tuple && type_tuple->haveExplicitNames())
@ -128,32 +130,67 @@ Block flatten(const Block & block)
return res;
}
namespace
{
using NameToDataType = std::map<String, DataTypePtr>;
NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types)
{
std::unordered_map<String, NamesAndTypesList> nested;
for (const auto & name_type : names_and_types)
{
const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get());
/// Ignore a true Nested type, but try to unite flattened arrays into a Nested type.
if (!isNested(name_type.type) && type_arr)
{
auto split = splitName(name_type.name);
if (!split.second.empty())
nested[split.first].emplace_back(split.second, type_arr->getNestedType());
}
}
std::map<String, DataTypePtr> nested_types;
for (const auto & [name, elems] : nested)
nested_types.emplace(name, createNested(elems.getTypes(), elems.getNames()));
return nested_types;
}
}
NamesAndTypesList collect(const NamesAndTypesList & names_and_types)
{
NamesAndTypesList res;
auto nested_types = getSubcolumnsOfNested(names_and_types);
std::map<std::string, NamesAndTypesList> nested;
for (const auto & name_type : names_and_types)
{
bool collected = false;
if (const DataTypeArray * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get()))
{
auto split = splitName(name_type.name);
if (!split.second.empty())
{
nested[split.first].emplace_back(split.second, type_arr->getNestedType());
collected = true;
}
}
if (!collected)
if (!nested_types.count(splitName(name_type.name).first))
res.push_back(name_type);
}
for (const auto & name_elems : nested)
res.emplace_back(name_elems.first, std::make_shared<DataTypeArray>(
std::make_shared<DataTypeTuple>(name_elems.second.getTypes(), name_elems.second.getNames())));
for (const auto & name_type : nested_types)
res.emplace_back(name_type.first, name_type.second);
return res;
}
NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types)
{
auto nested_types = getSubcolumnsOfNested(names_and_types);
auto res = names_and_types;
for (auto & name_type : res)
{
auto split = splitName(name_type.name);
if (name_type.isSubcolumn() || split.second.empty())
continue;
auto it = nested_types.find(split.first);
if (it != nested_types.end())
name_type = NameAndTypePair{split.first, split.second, it->second, it->second->getSubcolumnType(split.second)};
}
return res;
}
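A sketch (assumptions only, not part of this change) of what the two functions are expected to produce for a pair of flattened arrays:

#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Core/NamesAndTypes.h>

using namespace DB;

/// Sketch only.
void collectNestedExample()
{
    NamesAndTypesList columns;
    columns.emplace_back("n.a", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt32>()));
    columns.emplace_back("n.b", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));

    /// Expected to unite the flattened arrays into a single column
    /// "n Nested(a UInt32, b String)", i.e. Array(Tuple(a UInt32, b String)) with a Nested custom name.
    auto collected = Nested::collect(columns);

    /// Expected to keep the names "n.a" and "n.b" but turn them into subcolumns of the
    /// Nested type "n", so they can be read through the subcolumn machinery above.
    auto as_subcolumns = Nested::convertToSubcolumns(columns);
}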

View File

@ -23,6 +23,9 @@ namespace Nested
/// Collect Array columns of the form `column_name.element_name` into a single Array(Tuple(...)) column.
NamesAndTypesList collect(const NamesAndTypesList & names_and_types);
/// Convert old-style nested columns (separate arrays with the same prefix: `n.a`, `n.b`, ...) to subcolumns of the Nested data type.
NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types);
/// Check that sizes of arrays - elements of nested data structures - are equal.
void validateArraySizes(const Block & block);
}

View File

@ -28,9 +28,11 @@ SRCS(
DataTypeLowCardinality.cpp
DataTypeLowCardinalityHelpers.cpp
DataTypeMap.cpp
DataTypeNested.cpp
DataTypeNothing.cpp
DataTypeNullable.cpp
DataTypeNumberBase.cpp
DataTypeOneElementTuple.cpp
DataTypeString.cpp
DataTypeTuple.cpp
DataTypeUUID.cpp

View File

@ -23,11 +23,19 @@
# include <Databases/MySQL/DatabaseConnectionMySQL.h>
# include <Databases/MySQL/MaterializeMySQLSettings.h>
# include <Databases/MySQL/DatabaseMaterializeMySQL.h>
# include <Interpreters/evaluateConstantExpression.h>
# include <Common/parseAddress.h>
# include <mysqlxx/Pool.h>
#endif
#if USE_MYSQL || USE_LIBPQXX
#include <Interpreters/evaluateConstantExpression.h>
#include <Common/parseAddress.h>
#endif
#if USE_LIBPQXX
#include <Databases/PostgreSQL/DatabasePostgreSQL.h> // Y_IGNORE
#include <Storages/PostgreSQL/PostgreSQLConnection.h>
#endif
namespace DB
{
@ -80,7 +88,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
const String & engine_name = engine_define->engine->name;
const UUID & uuid = create.uuid;
if (engine_name != "MySQL" && engine_name != "MaterializeMySQL" && engine_name != "Lazy" && engine_define->engine->arguments)
if (engine_name != "MySQL" && engine_name != "MaterializeMySQL" && engine_name != "Lazy" && engine_name != "PostgreSQL" && engine_define->engine->arguments)
throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS);
if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key || engine_define->order_by ||
@ -168,6 +176,44 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
return std::make_shared<DatabaseLazy>(database_name, metadata_path, cache_expiration_time_seconds, context);
}
#if USE_LIBPQXX
else if (engine_name == "PostgreSQL")
{
const ASTFunction * engine = engine_define->engine;
if (!engine->arguments || engine->arguments->children.size() < 4 || engine->arguments->children.size() > 5)
throw Exception(fmt::format(
"{} Database require host:port, database_name, username, password arguments "
"[, use_table_cache = 0].", engine_name),
ErrorCodes::BAD_ARGUMENTS);
ASTs & engine_args = engine->arguments->children;
for (auto & engine_arg : engine_args)
engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, context);
const auto & host_port = safeGetLiteralValue<String>(engine_args[0], engine_name);
const auto & postgres_database_name = safeGetLiteralValue<String>(engine_args[1], engine_name);
const auto & username = safeGetLiteralValue<String>(engine_args[2], engine_name);
const auto & password = safeGetLiteralValue<String>(engine_args[3], engine_name);
auto use_table_cache = 0;
if (engine->arguments->children.size() == 5)
use_table_cache = safeGetLiteralValue<UInt64>(engine_args[4], engine_name);
auto parsed_host_port = parseAddress(host_port, 5432);
/// no connection is made here
auto connection = std::make_shared<PostgreSQLConnection>(
postgres_database_name, parsed_host_port.first, parsed_host_port.second, username, password);
return std::make_shared<DatabasePostgreSQL>(
context, metadata_path, engine_define, database_name, postgres_database_name, connection, use_table_cache);
}
#endif
throw Exception("Unknown database engine: " + engine_name, ErrorCodes::UNKNOWN_DATABASE_ENGINE);
}

View File

@ -0,0 +1,415 @@
#include <Databases/PostgreSQL/DatabasePostgreSQL.h>
#if USE_LIBPQXX
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeArray.h>
#include <Storages/StoragePostgreSQL.h>
#include <Storages/PostgreSQL/PostgreSQLConnection.h>
#include <Interpreters/Context.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/queryToString.h>
#include <Common/escapeForFileName.h>
#include <Poco/DirectoryIterator.h>
#include <Poco/File.h>
#include <Databases/PostgreSQL/fetchPostgreSQLTableStructure.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int NOT_IMPLEMENTED;
extern const int UNKNOWN_TABLE;
extern const int TABLE_IS_DROPPED;
extern const int TABLE_ALREADY_EXISTS;
}
static const auto suffix = ".removed";
static const auto cleaner_reschedule_ms = 60000;
DatabasePostgreSQL::DatabasePostgreSQL(
const Context & context,
const String & metadata_path_,
const ASTStorage * database_engine_define_,
const String & dbname_,
const String & postgres_dbname,
PostgreSQLConnectionPtr connection_,
const bool cache_tables_)
: IDatabase(dbname_)
, global_context(context.getGlobalContext())
, metadata_path(metadata_path_)
, database_engine_define(database_engine_define_->clone())
, dbname(postgres_dbname)
, connection(std::move(connection_))
, cache_tables(cache_tables_)
{
cleaner_task = context.getSchedulePool().createTask("PostgreSQLCleanerTask", [this]{ removeOutdatedTables(); });
cleaner_task->deactivate();
}
bool DatabasePostgreSQL::empty() const
{
std::lock_guard<std::mutex> lock(mutex);
auto tables_list = fetchTablesList();
for (const auto & table_name : tables_list)
if (!detached_or_dropped.count(table_name))
return false;
return true;
}
DatabaseTablesIteratorPtr DatabasePostgreSQL::getTablesIterator(
const Context & context, const FilterByNameFunction & /* filter_by_table_name */)
{
std::lock_guard<std::mutex> lock(mutex);
Tables tables;
auto table_names = fetchTablesList();
for (const auto & table_name : table_names)
if (!detached_or_dropped.count(table_name))
tables[table_name] = fetchTable(table_name, context, true);
return std::make_unique<DatabaseTablesSnapshotIterator>(tables, database_name);
}
std::unordered_set<std::string> DatabasePostgreSQL::fetchTablesList() const
{
std::unordered_set<std::string> tables;
std::string query = "SELECT tablename FROM pg_catalog.pg_tables "
"WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema'";
pqxx::read_transaction tx(*connection->conn());
for (auto table_name : tx.stream<std::string>(query))
tables.insert(std::get<0>(table_name));
return tables;
}
bool DatabasePostgreSQL::checkPostgresTable(const String & table_name) const
{
if (table_name.find('\'') != std::string::npos
|| table_name.find('\\') != std::string::npos)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"PostgreSQL table name cannot contain single quote or backslash characters, passed {}", table_name);
}
pqxx::nontransaction tx(*connection->conn());
try
{
/// Casting table_name::regclass throws pqxx::undefined_table exception if table_name is incorrect.
pqxx::result result = tx.exec(fmt::format(
"SELECT '{}'::regclass, tablename "
"FROM pg_catalog.pg_tables "
"WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema' "
"AND tablename = '{}'", table_name, table_name));
}
catch (pqxx::undefined_table const &)
{
return false;
}
catch (Exception & e)
{
e.addMessage("while checking postgresql table existence");
throw;
}
return true;
}
bool DatabasePostgreSQL::isTableExist(const String & table_name, const Context & /* context */) const
{
std::lock_guard<std::mutex> lock(mutex);
if (detached_or_dropped.count(table_name))
return false;
return checkPostgresTable(table_name);
}
StoragePtr DatabasePostgreSQL::tryGetTable(const String & table_name, const Context & context) const
{
std::lock_guard<std::mutex> lock(mutex);
if (!detached_or_dropped.count(table_name))
return fetchTable(table_name, context, false);
return StoragePtr{};
}
StoragePtr DatabasePostgreSQL::fetchTable(const String & table_name, const Context & context, const bool table_checked) const
{
if (!cache_tables || !cached_tables.count(table_name))
{
if (!table_checked && !checkPostgresTable(table_name))
return StoragePtr{};
auto use_nulls = context.getSettingsRef().external_table_functions_use_nulls;
auto columns = fetchPostgreSQLTableStructure(connection->conn(), table_name, use_nulls);
if (!columns)
return StoragePtr{};
auto storage = StoragePostgreSQL::create(
StorageID(database_name, table_name), table_name, std::make_shared<PostgreSQLConnection>(connection->conn_str()),
ColumnsDescription{*columns}, ConstraintsDescription{}, context);
if (cache_tables)
cached_tables[table_name] = storage;
return storage;
}
if (table_checked || checkPostgresTable(table_name))
{
return cached_tables[table_name];
}
/// Table does not exist anymore
cached_tables.erase(table_name);
return StoragePtr{};
}
void DatabasePostgreSQL::attachTable(const String & table_name, const StoragePtr & storage, const String &)
{
std::lock_guard<std::mutex> lock{mutex};
if (!checkPostgresTable(table_name))
throw Exception(fmt::format("Cannot attach table {}.{} because it does not exist", database_name, table_name), ErrorCodes::UNKNOWN_TABLE);
if (!detached_or_dropped.count(table_name))
throw Exception(fmt::format("Cannot attach table {}.{}. It already exists", database_name, table_name), ErrorCodes::TABLE_ALREADY_EXISTS);
if (cache_tables)
cached_tables[table_name] = storage;
detached_or_dropped.erase(table_name);
Poco::File table_marked_as_removed(getMetadataPath() + '/' + escapeForFileName(table_name) + suffix);
if (table_marked_as_removed.exists())
table_marked_as_removed.remove();
}
StoragePtr DatabasePostgreSQL::detachTable(const String & table_name)
{
std::lock_guard<std::mutex> lock{mutex};
if (detached_or_dropped.count(table_name))
throw Exception(fmt::format("Cannot detach table {}.{}. It is already dropped/detached", database_name, table_name), ErrorCodes::TABLE_IS_DROPPED);
if (!checkPostgresTable(table_name))
throw Exception(fmt::format("Cannot detach table {}.{} because it does not exist", database_name, table_name), ErrorCodes::UNKNOWN_TABLE);
if (cache_tables)
cached_tables.erase(table_name);
detached_or_dropped.emplace(table_name);
/// The return value is not used anywhere (for the PostgreSQL database engine), so return an empty pointer.
return StoragePtr{};
}
void DatabasePostgreSQL::createTable(const Context &, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query)
{
const auto & create = create_query->as<ASTCreateQuery>();
if (!create->attach)
throw Exception("PostgreSQL database engine does not support create table", ErrorCodes::NOT_IMPLEMENTED);
attachTable(table_name, storage, {});
}
void DatabasePostgreSQL::dropTable(const Context &, const String & table_name, bool /* no_delay */)
{
std::lock_guard<std::mutex> lock{mutex};
if (!checkPostgresTable(table_name))
throw Exception(fmt::format("Cannot drop table {}.{} because it does not exist", database_name, table_name), ErrorCodes::UNKNOWN_TABLE);
if (detached_or_dropped.count(table_name))
throw Exception(fmt::format("Table {}.{} is already dropped/detached", database_name, table_name), ErrorCodes::TABLE_IS_DROPPED);
Poco::File mark_table_removed(getMetadataPath() + '/' + escapeForFileName(table_name) + suffix);
mark_table_removed.createFile();
if (cache_tables)
cached_tables.erase(table_name);
detached_or_dropped.emplace(table_name);
}
void DatabasePostgreSQL::drop(const Context & /*context*/)
{
Poco::File(getMetadataPath()).remove(true);
}
void DatabasePostgreSQL::loadStoredObjects(Context & /* context */, bool, bool /*force_attach*/)
{
{
std::lock_guard<std::mutex> lock{mutex};
Poco::DirectoryIterator iterator(getMetadataPath());
/// Check for previously dropped tables
for (Poco::DirectoryIterator end; iterator != end; ++iterator)
{
if (iterator->isFile() && endsWith(iterator.name(), suffix))
{
const auto & file_name = iterator.name();
const auto & table_name = unescapeForFileName(file_name.substr(0, file_name.size() - strlen(suffix)));
detached_or_dropped.emplace(table_name);
}
}
}
cleaner_task->activateAndSchedule();
}
void DatabasePostgreSQL::removeOutdatedTables()
{
std::lock_guard<std::mutex> lock{mutex};
auto actual_tables = fetchTablesList();
if (cache_tables)
{
/// (Tables are cached only after being accessed at least once)
for (auto iter = cached_tables.begin(); iter != cached_tables.end();)
{
if (!actual_tables.count(iter->first))
iter = cached_tables.erase(iter);
else
++iter;
}
}
for (auto iter = detached_or_dropped.begin(); iter != detached_or_dropped.end();)
{
if (!actual_tables.count(*iter))
{
auto table_name = *iter;
iter = detached_or_dropped.erase(iter);
Poco::File table_marked_as_removed(getMetadataPath() + '/' + escapeForFileName(table_name) + suffix);
if (table_marked_as_removed.exists())
table_marked_as_removed.remove();
}
else
++iter;
}
cleaner_task->scheduleAfter(cleaner_reschedule_ms);
}
void DatabasePostgreSQL::shutdown()
{
cleaner_task->deactivate();
}
ASTPtr DatabasePostgreSQL::getCreateDatabaseQuery() const
{
const auto & create_query = std::make_shared<ASTCreateQuery>();
create_query->database = getDatabaseName();
create_query->set(create_query->storage, database_engine_define);
return create_query;
}
ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, const Context & context, bool throw_on_error) const
{
auto storage = fetchTable(table_name, context, false);
if (!storage)
{
if (throw_on_error)
throw Exception(fmt::format("PostgreSQL table {}.{} does not exist", database_name, table_name), ErrorCodes::UNKNOWN_TABLE);
return nullptr;
}
auto create_table_query = std::make_shared<ASTCreateQuery>();
auto table_storage_define = database_engine_define->clone();
create_table_query->set(create_table_query->storage, table_storage_define);
auto columns_declare_list = std::make_shared<ASTColumns>();
auto columns_expression_list = std::make_shared<ASTExpressionList>();
columns_declare_list->set(columns_declare_list->columns, columns_expression_list);
create_table_query->set(create_table_query->columns_list, columns_declare_list);
/// init create query.
auto table_id = storage->getStorageID();
create_table_query->table = table_id.table_name;
create_table_query->database = table_id.database_name;
auto metadata_snapshot = storage->getInMemoryMetadataPtr();
for (const auto & column_type_and_name : metadata_snapshot->getColumns().getOrdinary())
{
const auto & column_declaration = std::make_shared<ASTColumnDeclaration>();
column_declaration->name = column_type_and_name.name;
column_declaration->type = getColumnDeclaration(column_type_and_name.type);
columns_expression_list->children.emplace_back(column_declaration);
}
ASTStorage * ast_storage = table_storage_define->as<ASTStorage>();
ASTs storage_children = ast_storage->children;
auto storage_engine_arguments = ast_storage->engine->arguments;
/// Remove extra engine argument (`use_table_cache`)
if (storage_engine_arguments->children.size() > 4)
storage_engine_arguments->children.resize(storage_engine_arguments->children.size() - 1);
/// Add table_name to engine arguments
assert(storage_engine_arguments->children.size() >= 2);
storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared<ASTLiteral>(table_id.table_name));
return create_table_query;
}
ASTPtr DatabasePostgreSQL::getColumnDeclaration(const DataTypePtr & data_type) const
{
WhichDataType which(data_type);
if (which.isNullable())
return makeASTFunction("Nullable", getColumnDeclaration(typeid_cast<const DataTypeNullable *>(data_type.get())->getNestedType()));
if (which.isArray())
return makeASTFunction("Array", getColumnDeclaration(typeid_cast<const DataTypeArray *>(data_type.get())->getNestedType()));
return std::make_shared<ASTIdentifier>(data_type->getName());
}
}
#endif

View File

@ -0,0 +1,91 @@
#pragma once
#if !defined(ARCADIA_BUILD)
#include "config_core.h"
#endif
#if USE_LIBPQXX
#include <Databases/DatabasesCommon.h>
#include <Core/BackgroundSchedulePool.h>
#include <Parsers/ASTCreateQuery.h>
namespace DB
{
class Context;
class PostgreSQLConnection;
using PostgreSQLConnectionPtr = std::shared_ptr<PostgreSQLConnection>;
/** Real-time access to table list and table structure from remote PostgreSQL.
* All tables are created after pulling the table structure from the remote PostgreSQL database.
* If `cache_tables` == 1 (default: 0), the table structure is cached and not checked for modifications,
* but it will be updated on detach->attach.
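*
* A minimal usage sketch (connection values are hypothetical; the argument order is inferred from
* the `use_table_cache` handling in getCreateTableQueryImpl()):
*     CREATE DATABASE postgres_db
*     ENGINE = PostgreSQL('postgres-host:5432', 'postgres_database', 'user', 'password', 1);
* The optional trailing `1` is the `use_table_cache` argument that maps to `cache_tables`.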
*/
class DatabasePostgreSQL final : public IDatabase
{
public:
DatabasePostgreSQL(
const Context & context,
const String & metadata_path_,
const ASTStorage * database_engine_define,
const String & dbname_,
const String & postgres_dbname,
PostgreSQLConnectionPtr connection_,
const bool cache_tables_);
String getEngineName() const override { return "PostgreSQL"; }
String getMetadataPath() const override { return metadata_path; }
bool canContainMergeTreeTables() const override { return false; }
bool canContainDistributedTables() const override { return false; }
bool shouldBeEmptyOnDetach() const override { return false; }
ASTPtr getCreateDatabaseQuery() const override;
bool empty() const override;
void loadStoredObjects(Context &, bool, bool force_attach) override;
DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override;
bool isTableExist(const String & name, const Context & context) const override;
StoragePtr tryGetTable(const String & name, const Context & context) const override;
void createTable(const Context &, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) override;
void dropTable(const Context &, const String & table_name, bool no_delay) override;
void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override;
StoragePtr detachTable(const String & table_name) override;
void drop(const Context & /*context*/) override;
void shutdown() override;
protected:
ASTPtr getCreateTableQueryImpl(const String & table_name, const Context & context, bool throw_on_error) const override;
private:
const Context & global_context;
String metadata_path;
ASTPtr database_engine_define;
String dbname;
PostgreSQLConnectionPtr connection;
const bool cache_tables;
mutable Tables cached_tables;
std::unordered_set<std::string> detached_or_dropped;
BackgroundSchedulePool::TaskHolder cleaner_task;
bool checkPostgresTable(const String & table_name) const;
std::unordered_set<std::string> fetchTablesList() const;
StoragePtr fetchTable(const String & table_name, const Context & context, const bool table_checked) const;
void removeOutdatedTables();
ASTPtr getColumnDeclaration(const DataTypePtr & data_type) const;
};
}
#endif

View File

@ -0,0 +1,139 @@
#include <Databases/PostgreSQL/fetchPostgreSQLTableStructure.h>
#if USE_LIBPQXX
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <pqxx/pqxx>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_TABLE;
extern const int BAD_ARGUMENTS;
}
static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullable, uint16_t dimensions)
{
DataTypePtr res;
/// Get rid of trailing '[]' for arrays
if (dimensions)
while (type.ends_with("[]"))
type.resize(type.size() - 2);
if (type == "smallint")
res = std::make_shared<DataTypeInt16>();
else if (type == "integer")
res = std::make_shared<DataTypeInt32>();
else if (type == "bigint")
res = std::make_shared<DataTypeInt64>();
else if (type == "real")
res = std::make_shared<DataTypeFloat32>();
else if (type == "double precision")
res = std::make_shared<DataTypeFloat64>();
else if (type == "serial")
res = std::make_shared<DataTypeUInt32>();
else if (type == "bigserial")
res = std::make_shared<DataTypeUInt64>();
else if (type.starts_with("timestamp"))
res = std::make_shared<DataTypeDateTime>();
else if (type == "date")
res = std::make_shared<DataTypeDate>();
else if (type.starts_with("numeric"))
{
/// Numeric and decimal will both end up here as numeric.
res = DataTypeFactory::instance().get(type);
uint32_t precision = getDecimalPrecision(*res);
uint32_t scale = getDecimalScale(*res);
if (precision <= DecimalUtils::maxPrecision<Decimal32>())
res = std::make_shared<DataTypeDecimal<Decimal32>>(precision, scale);
else if (precision <= DecimalUtils::maxPrecision<Decimal64>())
res = std::make_shared<DataTypeDecimal<Decimal64>>(precision, scale);
else if (precision <= DecimalUtils::maxPrecision<Decimal128>())
res = std::make_shared<DataTypeDecimal<Decimal128>>(precision, scale);
else if (precision <= DecimalUtils::maxPrecision<Decimal256>())
res = std::make_shared<DataTypeDecimal<Decimal256>>(precision, scale);
}
if (!res)
res = std::make_shared<DataTypeString>();
if (is_nullable)
res = std::make_shared<DataTypeNullable>(res);
while (dimensions--)
res = std::make_shared<DataTypeArray>(res);
return res;
}
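/// A sketch of the mapping above (illustrative values, not exhaustive):
///     smallint                                     -> Int16
///     numeric(10, 2)                               -> Decimal(10, 2)
///     numeric(10, 2)[]  (dimensions = 1)           -> Array(Decimal(10, 2))
///     integer, nullable, with use_nulls = true     -> Nullable(Int32)
///     text / uuid (no explicit branch)             -> String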
std::shared_ptr<NamesAndTypesList> fetchPostgreSQLTableStructure(
std::shared_ptr<pqxx::connection> connection, const String & postgres_table_name, bool use_nulls)
{
auto columns = NamesAndTypesList();
if (postgres_table_name.find('\'') != std::string::npos
|| postgres_table_name.find('\\') != std::string::npos)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "PostgreSQL table name cannot contain single quote or backslash characters, passed {}",
postgres_table_name);
}
std::string query = fmt::format(
"SELECT attname AS name, format_type(atttypid, atttypmod) AS type, "
"attnotnull AS not_null, attndims AS dims "
"FROM pg_attribute "
"WHERE attrelid = '{}'::regclass "
"AND NOT attisdropped AND attnum > 0", postgres_table_name);
try
{
pqxx::read_transaction tx(*connection);
pqxx::stream_from stream(tx, pqxx::from_query, std::string_view(query));
std::tuple<std::string, std::string, std::string, uint16_t> row;
while (stream >> row)
{
columns.push_back(NameAndTypePair(
std::get<0>(row),
convertPostgreSQLDataType(
std::get<1>(row),
use_nulls && (std::get<2>(row) == "f"), /// 'f' means that postgres `not_null` is false, i.e. value is nullable
std::get<3>(row))));
}
stream.complete();
tx.commit();
}
catch (const pqxx::undefined_table &)
{
throw Exception(fmt::format(
"PostgreSQL table {}.{} does not exist",
connection->dbname(), postgres_table_name), ErrorCodes::UNKNOWN_TABLE);
}
catch (Exception & e)
{
e.addMessage("while fetching postgresql table structure");
throw;
}
if (columns.empty())
return nullptr;
return std::make_shared<NamesAndTypesList>(columns);
}
}
#endif

View File

@ -0,0 +1,19 @@
#pragma once
#if !defined(ARCADIA_BUILD)
#include "config_core.h"
#endif
#if USE_LIBPQXX
#include <Storages/StoragePostgreSQL.h>
namespace DB
{
std::shared_ptr<NamesAndTypesList> fetchPostgreSQLTableStructure(
std::shared_ptr<pqxx::connection> connection, const String & postgres_table_name, bool use_nulls);
}
#endif

View File

@ -8,7 +8,7 @@ PEERDIR(
SRCS(
<? find . -name '*.cpp' | sed 's/^\.\// /' | sort ?>
<? find . -name '*.cpp' | grep -v -F 'PostgreSQL' | sed 's/^\.\// /' | sort ?>
)
END()

View File

@ -0,0 +1,196 @@
#include "PostgreSQLDictionarySource.h"
#include <Poco/Util/AbstractConfiguration.h>
#include "DictionarySourceFactory.h"
#include "registerDictionaries.h"
#if USE_LIBPQXX
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <DataStreams/PostgreSQLBlockInputStream.h>
#include <Storages/PostgreSQL/PostgreSQLConnection.h>
#include "readInvalidateQuery.h"
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int SUPPORT_IS_DISABLED;
}
#if USE_LIBPQXX
static const UInt64 max_block_size = 8192;
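/// A configuration sketch for this dictionary source. Key names follow the config reads in the
/// constructor and in registerDictionarySourcePostgreSQL() below; the enclosing <source> element
/// and all values are illustrative assumptions:
///     <source>
///         <postgresql>
///             <host>postgres-host</host>
///             <port>5432</port>
///             <user>user</user>
///             <password>password</password>
///             <db>postgres_database</db>
///             <table>dictionary_source</table>
///             <where>id > 0</where>
///             <invalidate_query>SELECT max(updated_at) FROM dictionary_source</invalidate_query>
///             <update_field>updated_at</update_field>
///         </postgresql>
///     </source>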
PostgreSQLDictionarySource::PostgreSQLDictionarySource(
const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config_,
const std::string & config_prefix,
PostgreSQLConnectionPtr connection_,
const Block & sample_block_)
: dict_struct{dict_struct_}
, sample_block(sample_block_)
, connection(std::move(connection_))
, log(&Poco::Logger::get("PostgreSQLDictionarySource"))
, db(config_.getString(fmt::format("{}.db", config_prefix), ""))
, table(config_.getString(fmt::format("{}.table", config_prefix), ""))
, where(config_.getString(fmt::format("{}.where", config_prefix), ""))
, query_builder(dict_struct, "", "", table, where, IdentifierQuotingStyle::DoubleQuotes)
, load_all_query(query_builder.composeLoadAllQuery())
, invalidate_query(config_.getString(fmt::format("{}.invalidate_query", config_prefix), ""))
, update_field(config_.getString(fmt::format("{}.update_field", config_prefix), ""))
{
}
/// copy-constructor is provided in order to support cloneability
PostgreSQLDictionarySource::PostgreSQLDictionarySource(const PostgreSQLDictionarySource & other)
: dict_struct(other.dict_struct)
, sample_block(other.sample_block)
, connection(std::make_shared<PostgreSQLConnection>(other.connection->conn_str()))
, log(&Poco::Logger::get("PostgreSQLDictionarySource"))
, db(other.db)
, table(other.table)
, where(other.where)
, query_builder(dict_struct, "", "", table, where, IdentifierQuotingStyle::DoubleQuotes)
, load_all_query(query_builder.composeLoadAllQuery())
, invalidate_query(other.invalidate_query)
, update_time(other.update_time)
, update_field(other.update_field)
, invalidate_query_response(other.invalidate_query_response)
{
}
BlockInputStreamPtr PostgreSQLDictionarySource::loadAll()
{
LOG_TRACE(log, load_all_query);
return std::make_shared<PostgreSQLBlockInputStream>(
connection->conn(), load_all_query, sample_block, max_block_size);
}
BlockInputStreamPtr PostgreSQLDictionarySource::loadUpdatedAll()
{
auto load_update_query = getUpdateFieldAndDate();
LOG_TRACE(log, load_update_query);
return std::make_shared<PostgreSQLBlockInputStream>(connection->conn(), load_update_query, sample_block, max_block_size);
}
BlockInputStreamPtr PostgreSQLDictionarySource::loadIds(const std::vector<UInt64> & ids)
{
const auto query = query_builder.composeLoadIdsQuery(ids);
return std::make_shared<PostgreSQLBlockInputStream>(connection->conn(), query, sample_block, max_block_size);
}
BlockInputStreamPtr PostgreSQLDictionarySource::loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows)
{
const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN);
return std::make_shared<PostgreSQLBlockInputStream>(connection->conn(), query, sample_block, max_block_size);
}
bool PostgreSQLDictionarySource::isModified() const
{
if (!invalidate_query.empty())
{
auto response = doInvalidateQuery(invalidate_query);
if (response == invalidate_query_response)
return false;
invalidate_query_response = response;
}
return true;
}
std::string PostgreSQLDictionarySource::doInvalidateQuery(const std::string & request) const
{
Block invalidate_sample_block;
ColumnPtr column(ColumnString::create());
invalidate_sample_block.insert(ColumnWithTypeAndName(column, std::make_shared<DataTypeString>(), "Sample Block"));
PostgreSQLBlockInputStream block_input_stream(connection->conn(), request, invalidate_sample_block, 1);
return readInvalidateQuery(block_input_stream);
}
bool PostgreSQLDictionarySource::hasUpdateField() const
{
return !update_field.empty();
}
std::string PostgreSQLDictionarySource::getUpdateFieldAndDate()
{
if (update_time != std::chrono::system_clock::from_time_t(0))
{
auto tmp_time = update_time;
update_time = std::chrono::system_clock::now();
time_t hr_time = std::chrono::system_clock::to_time_t(tmp_time) - 1;
std::string str_time = std::to_string(LocalDateTime(hr_time));
return query_builder.composeUpdateQuery(update_field, str_time);
}
else
{
update_time = std::chrono::system_clock::now();
return query_builder.composeLoadAllQuery();
}
}
bool PostgreSQLDictionarySource::supportsSelectiveLoad() const
{
return true;
}
DictionarySourcePtr PostgreSQLDictionarySource::clone() const
{
return std::make_unique<PostgreSQLDictionarySource>(*this);
}
std::string PostgreSQLDictionarySource::toString() const
{
return "PostgreSQL: " + db + '.' + table + (where.empty() ? "" : ", where: " + where);
}
#endif
void registerDictionarySourcePostgreSQL(DictionarySourceFactory & factory)
{
auto create_table_source = [=](const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & root_config_prefix,
Block & sample_block,
const Context & /* context */,
const std::string & /* default_database */,
bool /* check_config */) -> DictionarySourcePtr
{
#if USE_LIBPQXX
const auto config_prefix = root_config_prefix + ".postgresql";
auto connection = std::make_shared<PostgreSQLConnection>(
config.getString(fmt::format("{}.db", config_prefix), ""),
config.getString(fmt::format("{}.host", config_prefix), ""),
config.getUInt(fmt::format("{}.port", config_prefix), 0),
config.getString(fmt::format("{}.user", config_prefix), ""),
config.getString(fmt::format("{}.password", config_prefix), ""));
return std::make_unique<PostgreSQLDictionarySource>(
dict_struct, config, config_prefix, connection, sample_block);
#else
(void)dict_struct;
(void)config;
(void)root_config_prefix;
(void)sample_block;
throw Exception{"Dictionary source of type `postgresql` is disabled because ClickHouse was built without postgresql support.",
ErrorCodes::SUPPORT_IS_DISABLED};
#endif
};
factory.registerSource("postgresql", create_table_source);
}
}

View File

@ -0,0 +1,70 @@
#pragma once
#if !defined(ARCADIA_BUILD)
#include "config_core.h"
#endif
#include "DictionaryStructure.h"
#include "IDictionarySource.h"
#if USE_LIBPQXX
#include "ExternalQueryBuilder.h"
#include <Core/Block.h>
#include <common/LocalDateTime.h>
#include <common/logger_useful.h>
#include <Storages/StoragePostgreSQL.h>
#include <pqxx/pqxx>
namespace DB
{
/// Allows loading dictionaries from a PostgreSQL database
class PostgreSQLDictionarySource final : public IDictionarySource
{
public:
PostgreSQLDictionarySource(
const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config_,
const std::string & config_prefix,
PostgreSQLConnectionPtr connection_,
const Block & sample_block_);
/// copy-constructor is provided in order to support cloneability
PostgreSQLDictionarySource(const PostgreSQLDictionarySource & other);
PostgreSQLDictionarySource & operator=(const PostgreSQLDictionarySource &) = delete;
BlockInputStreamPtr loadAll() override;
BlockInputStreamPtr loadUpdatedAll() override;
BlockInputStreamPtr loadIds(const std::vector<UInt64> & ids) override;
BlockInputStreamPtr loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows) override;
bool isModified() const override;
bool supportsSelectiveLoad() const override;
bool hasUpdateField() const override;
DictionarySourcePtr clone() const override;
std::string toString() const override;
private:
std::string getUpdateFieldAndDate();
std::string doInvalidateQuery(const std::string & request) const;
const DictionaryStructure dict_struct;
Block sample_block;
PostgreSQLConnectionPtr connection;
Poco::Logger * log;
const std::string db;
const std::string table;
const std::string where;
ExternalQueryBuilder query_builder;
const std::string load_all_query;
std::string invalidate_query;
std::chrono::time_point<std::chrono::system_clock> update_time;
const std::string update_field;
mutable std::string invalidate_query_response;
};
}
#endif

View File

@ -1644,6 +1644,8 @@ void SSDComplexKeyCacheDictionary::has(
const DataTypes & key_types,
PaddedPODArray<UInt8> & out) const
{
dict_struct.validateKeyTypes(key_types);
const auto now = std::chrono::system_clock::now();
std::unordered_map<KeyRef, std::vector<size_t>> not_found_keys;

View File

@ -4,6 +4,40 @@
namespace DB
{
class DictionarySourceFactory;
void registerDictionarySourceFile(DictionarySourceFactory & source_factory);
void registerDictionarySourceMysql(DictionarySourceFactory & source_factory);
void registerDictionarySourceClickHouse(DictionarySourceFactory & source_factory);
void registerDictionarySourceMongoDB(DictionarySourceFactory & source_factory);
void registerDictionarySourceCassandra(DictionarySourceFactory & source_factory);
void registerDictionarySourceRedis(DictionarySourceFactory & source_factory);
void registerDictionarySourceXDBC(DictionarySourceFactory & source_factory);
void registerDictionarySourceJDBC(DictionarySourceFactory & source_factory);
#if !defined(ARCADIA_BUILD)
void registerDictionarySourcePostgreSQL(DictionarySourceFactory & source_factory);
#endif
void registerDictionarySourceExecutable(DictionarySourceFactory & source_factory);
void registerDictionarySourceHTTP(DictionarySourceFactory & source_factory);
void registerDictionarySourceLibrary(DictionarySourceFactory & source_factory);
class DictionaryFactory;
void registerDictionaryRangeHashed(DictionaryFactory & factory);
void registerDictionaryComplexKeyHashed(DictionaryFactory & factory);
void registerDictionaryComplexKeyCache(DictionaryFactory & factory);
void registerDictionaryComplexKeyDirect(DictionaryFactory & factory);
void registerDictionaryTrie(DictionaryFactory & factory);
void registerDictionaryFlat(DictionaryFactory & factory);
void registerDictionaryHashed(DictionaryFactory & factory);
void registerDictionaryCache(DictionaryFactory & factory);
#if defined(__linux__) || defined(__FreeBSD__)
void registerDictionarySSDCache(DictionaryFactory & factory);
void registerDictionarySSDComplexKeyCache(DictionaryFactory & factory);
#endif
void registerDictionaryPolygon(DictionaryFactory & factory);
void registerDictionaryDirect(DictionaryFactory & factory);
void registerDictionaries()
{
{
@ -16,6 +50,9 @@ void registerDictionaries()
registerDictionarySourceCassandra(source_factory);
registerDictionarySourceXDBC(source_factory);
registerDictionarySourceJDBC(source_factory);
#if !defined(ARCADIA_BUILD)
registerDictionarySourcePostgreSQL(source_factory);
#endif
registerDictionarySourceExecutable(source_factory);
registerDictionarySourceHTTP(source_factory);
registerDictionarySourceLibrary(source_factory);

View File

@ -2,36 +2,5 @@
namespace DB
{
class DictionarySourceFactory;
void registerDictionarySourceFile(DictionarySourceFactory & source_factory);
void registerDictionarySourceMysql(DictionarySourceFactory & source_factory);
void registerDictionarySourceClickHouse(DictionarySourceFactory & source_factory);
void registerDictionarySourceMongoDB(DictionarySourceFactory & source_factory);
void registerDictionarySourceCassandra(DictionarySourceFactory & source_factory);
void registerDictionarySourceRedis(DictionarySourceFactory & source_factory);
void registerDictionarySourceXDBC(DictionarySourceFactory & source_factory);
void registerDictionarySourceJDBC(DictionarySourceFactory & source_factory);
void registerDictionarySourceExecutable(DictionarySourceFactory & source_factory);
void registerDictionarySourceHTTP(DictionarySourceFactory & source_factory);
void registerDictionarySourceLibrary(DictionarySourceFactory & source_factory);
class DictionaryFactory;
void registerDictionaryRangeHashed(DictionaryFactory & factory);
void registerDictionaryComplexKeyHashed(DictionaryFactory & factory);
void registerDictionaryComplexKeyCache(DictionaryFactory & factory);
void registerDictionaryComplexKeyDirect(DictionaryFactory & factory);
void registerDictionaryTrie(DictionaryFactory & factory);
void registerDictionaryFlat(DictionaryFactory & factory);
void registerDictionaryHashed(DictionaryFactory & factory);
void registerDictionaryCache(DictionaryFactory & factory);
#if defined(__linux__) || defined(__FreeBSD__)
void registerDictionarySSDCache(DictionaryFactory & factory);
void registerDictionarySSDComplexKeyCache(DictionaryFactory & factory);
#endif
void registerDictionaryPolygon(DictionaryFactory & factory);
void registerDictionaryDirect(DictionaryFactory & factory);
void registerDictionaries();
}

View File

@ -15,7 +15,7 @@ NO_COMPILER_WARNINGS()
SRCS(
<? find . -name '*.cpp' | grep -v -F tests | grep -v -F Trie | sed 's/^\.\// /' | sort ?>
<? find . -name '*.cpp' | grep -v -P 'tests|PostgreSQL' | sed 's/^\.\// /' | sort ?>
)
END()

View File

@ -23,6 +23,7 @@ namespace DB
namespace ErrorCodes
{
extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
extern const int NOT_IMPLEMENTED;
}
MySQLBlockInputStream::Connection::Connection(
@ -114,6 +115,8 @@ namespace
case ValueType::vtFixedString:
assert_cast<ColumnFixedString &>(column).insertData(value.data(), value.size());
break;
default:
throw Exception("Unsupported value type", ErrorCodes::NOT_IMPLEMENTED);
}
}

View File

@ -465,7 +465,8 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription(
res.add(std::move(column));
}
res.flattenNested();
if (context.getSettingsRef().flatten_nested)
res.flattenNested();
if (res.getAllPhysical().empty())
throw Exception{"Cannot CREATE table without physical columns", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED};

View File

@ -13,6 +13,7 @@
#include <Parsers/parseQuery.h>
#include <Access/AccessFlags.h>
#include <Access/ContextAccess.h>
#include <AggregateFunctions/AggregateFunctionCount.h>
@ -100,6 +101,7 @@ namespace ErrorCodes
extern const int PARAMETER_OUT_OF_BOUND;
extern const int INVALID_LIMIT_EXPRESSION;
extern const int INVALID_WITH_FILL_EXPRESSION;
extern const int ACCESS_DENIED;
}
/// Assumes `storage` is set and the table filter (row-level security) is not empty.
@ -212,6 +214,36 @@ static void rewriteMultipleJoins(ASTPtr & query, const TablesWithColumns & table
JoinToSubqueryTransformVisitor(join_to_subs_data).visit(query);
}
/// Checks that the current user has the SELECT privilege.
static void checkAccessRightsForSelect(
const Context & context,
const StorageID & table_id,
const StorageMetadataPtr & table_metadata,
const Strings & required_columns,
const TreeRewriterResult & syntax_analyzer_result)
{
if (!syntax_analyzer_result.has_explicit_columns && table_metadata && !table_metadata->getColumns().empty())
{
/// For a trivial query like "SELECT count() FROM table" access is granted if at least
/// one column is accessible.
/// In this case just checking access for `required_columns` doesn't work correctly
/// because `required_columns` will contain the name of a column of minimum size (see TreeRewriterResult::collectUsedColumns())
/// which is probably not the same column as the column the current user has access to.
auto access = context.getAccess();
for (const auto & column : table_metadata->getColumns())
{
if (access->isGranted(AccessType::SELECT, table_id.database_name, table_id.table_name, column.name))
return;
}
throw Exception(context.getUserName() + ": Not enough privileges. "
"To execute this query it's necessary to have grant SELECT for at least one column on " + table_id.getFullTableName(),
ErrorCodes::ACCESS_DENIED);
}
/// General check.
context.checkAccess(AccessType::SELECT, table_id, required_columns);
}
/// Returns true if we should ignore quotas and limits for a specified table in the system database.
static bool shouldIgnoreQuotaAndLimits(const StorageID & table_id)
{
@ -467,7 +499,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
{
/// The current user should have the SELECT privilege.
/// If this table_id is for a table function we don't check access rights here because in this case they have been already checked in ITableFunction::execute().
context->checkAccess(AccessType::SELECT, table_id, required_columns);
checkAccessRightsForSelect(*context, table_id, metadata_snapshot, required_columns, *syntax_analyzer_result);
/// Remove limits for some tables in the `system` database.
if (shouldIgnoreQuotaAndLimits(table_id) && (joined_tables.tablesCount() <= 1))

View File

@ -526,7 +526,12 @@ void TreeRewriterResult::collectSourceColumns(bool add_special)
{
const ColumnsDescription & columns = metadata_snapshot->getColumns();
auto columns_from_storage = add_special ? columns.getAll() : columns.getAllPhysical();
NamesAndTypesList columns_from_storage;
if (storage->supportsSubcolumns())
columns_from_storage = add_special ? columns.getAllWithSubcolumns() : columns.getAllPhysicalWithSubcolumns();
else
columns_from_storage = add_special ? columns.getAll() : columns.getAllPhysical();
if (source_columns.empty())
source_columns.swap(columns_from_storage);
else
@ -590,11 +595,13 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
required.insert(column_name_type.name);
}
/// You need to read at least one column to find the number of rows.
if (is_select && required.empty())
/// Figure out if we're able to use the trivial count optimization.
has_explicit_columns = !required.empty();
if (is_select && !has_explicit_columns)
{
optimize_trivial_count = true;
/// You need to read at least one column to find the number of rows.
/// We will find a column with minimum <compressed_size, type_size, uncompressed_size>.
/// Because it is the column that is cheapest to read.
struct ColumnSizeTuple

View File

@ -53,6 +53,13 @@ struct TreeRewriterResult
/// Predicate optimizer overrides the sub queries
bool rewrite_subqueries = false;
/// Whether the query contains explicit columns like "SELECT column1 + column2 FROM table1".
/// Queries like "SELECT count() FROM table1", "SELECT 1" don't contain explicit columns.
bool has_explicit_columns = false;
/// Whether it's possible to use the trivial count optimization,
/// i.e. use a fast call of IStorage::totalRows() (or IStorage::totalRowsByPartitionPredicate())
/// instead of actual retrieving columns and counting rows.
bool optimize_trivial_count = false;
/// Cache isRemote() call for storage, because it may be too heavy.

View File

@ -158,8 +158,8 @@ SRCS(
interpretSubquery.cpp
join_common.cpp
loadMetadata.cpp
replaceAliasColumnsInQuery.cpp
processColumnTransformers.cpp
replaceAliasColumnsInQuery.cpp
sortBlock.cpp
)

View File

@ -1,4 +1,5 @@
#include <Common/Exception.h>
#include <common/logger_useful.h>
#include <Parsers/New/LexerErrorListener.h>
@ -17,7 +18,7 @@ extern int SYNTAX_ERROR;
void LexerErrorListener::syntaxError(Recognizer *, Token *, size_t, size_t, const std::string & message, std::exception_ptr)
{
std::cerr << "Lexer error: " << message << std::endl;
LOG_ERROR(&Poco::Logger::get("ClickHouseLexer"), "Lexer error: {}", message);
throw DB::Exception("Can't recognize input: " + message, ErrorCodes::SYNTAX_ERROR);
}

View File

@ -1,4 +1,5 @@
#include <Common/Exception.h>
#include <common/logger_useful.h>
#include <Parsers/New/ParserErrorListener.h>
@ -24,9 +25,10 @@ void ParserErrorListener::syntaxError(
{
auto * parser = dynamic_cast<ClickHouseParser*>(recognizer);
std::cerr << "Last element parsed so far:" << std::endl
<< parser->getRuleContext()->toStringTree(parser, true) << std::endl
<< "Parser error: (pos " << token->getStartIndex() << ") " << message << std::endl;
LOG_ERROR(&Poco::Logger::get("ClickHouseParser"),
"Last element parsed so far:\n"
"{}\n"
"Parser error: (pos {}) {}", parser->getRuleContext()->toStringTree(parser, true), token->getStartIndex(), message);
throw DB::Exception("Can't parse input: " + message, ErrorCodes::SYNTAX_ERROR);
}

12
src/Parsers/New/README.md Normal file
View File

@ -0,0 +1,12 @@
## How to generate source code files from grammar
The grammar is located in the `ClickHouseLexer.g4` and `ClickHouseParser.g4` files.
To generate the source code you need to install the `antlr4` binary locally:
```
cd src/Parsers/New
antlr4 -no-listener -visitor -package DB -Dlanguage=Cpp ClickHouseLexer.g4 # if you have changes in a lexer part of grammar
antlr4 -no-listener -visitor -package DB -Dlanguage=Cpp ClickHouseParser.g4
```
Commit only git-tracked generated files - not all of the generated content is required.

View File

@ -320,7 +320,8 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, const Context & con
metadata.columns.add(column, after_column, first);
/// Slow, because each time a list is copied
metadata.columns.flattenNested();
if (context.getSettingsRef().flatten_nested)
metadata.columns.flattenNested();
}
else if (type == DROP_COLUMN)
{

View File

@ -19,6 +19,7 @@
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeNested.h>
#include <Common/Exception.h>
#include <Interpreters/Context.h>
#include <Storages/IStorage.h>
@ -184,6 +185,7 @@ void ColumnsDescription::add(ColumnDescription column, const String & after_colu
insert_it = range.second;
}
addSubcolumns(column.name, column.type);
columns.get<0>().insert(insert_it, std::move(column));
}
@ -195,7 +197,10 @@ void ColumnsDescription::remove(const String & column_name)
ErrorCodes::NO_SUCH_COLUMN_IN_TABLE);
for (auto list_it = range.first; list_it != range.second;)
{
removeSubcolumns(list_it->name, list_it->type);
list_it = columns.get<0>().erase(list_it);
}
}
void ColumnsDescription::rename(const String & column_from, const String & column_to)
@ -268,6 +273,7 @@ void ColumnsDescription::flattenNested()
}
ColumnDescription column = std::move(*it);
removeSubcolumns(column.name, column.type);
it = columns.get<0>().erase(it);
const DataTypes & elements = type_tuple->getElements();
@ -281,6 +287,7 @@ void ColumnsDescription::flattenNested()
nested_column.name = Nested::concatenateName(column.name, names[i]);
nested_column.type = std::make_shared<DataTypeArray>(elements[i]);
addSubcolumns(nested_column.name, nested_column.type);
columns.get<0>().insert(it, std::move(nested_column));
}
}
@ -322,10 +329,10 @@ NamesAndTypesList ColumnsDescription::getAll() const
return ret;
}
bool ColumnsDescription::has(const String & column_name) const
{
return columns.get<1>().find(column_name) != columns.get<1>().end();
return columns.get<1>().find(column_name) != columns.get<1>().end()
|| subcolumns.find(column_name) != subcolumns.end();
}
bool ColumnsDescription::hasNested(const String & column_name) const
@ -371,12 +378,56 @@ NameAndTypePair ColumnsDescription::getPhysical(const String & column_name) cons
return NameAndTypePair(it->name, it->type);
}
NameAndTypePair ColumnsDescription::getPhysicalOrSubcolumn(const String & column_name) const
{
if (auto it = columns.get<1>().find(column_name); it != columns.get<1>().end()
&& it->default_desc.kind != ColumnDefaultKind::Alias)
{
return NameAndTypePair(it->name, it->type);
}
if (auto it = subcolumns.find(column_name); it != subcolumns.end())
{
return it->second;
}
throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE,
"There is no physical column or subcolumn {} in table.", column_name);
}
bool ColumnsDescription::hasPhysical(const String & column_name) const
{
auto it = columns.get<1>().find(column_name);
return it != columns.get<1>().end() && it->default_desc.kind != ColumnDefaultKind::Alias;
}
bool ColumnsDescription::hasPhysicalOrSubcolumn(const String & column_name) const
{
return hasPhysical(column_name) || subcolumns.find(column_name) != subcolumns.end();
}
static NamesAndTypesList getWithSubcolumns(NamesAndTypesList && source_list)
{
NamesAndTypesList ret;
for (const auto & col : source_list)
{
ret.emplace_back(col.name, col.type);
for (const auto & subcolumn : col.type->getSubcolumnNames())
ret.emplace_back(col.name, subcolumn, col.type, col.type->getSubcolumnType(subcolumn));
}
return ret;
}
NamesAndTypesList ColumnsDescription::getAllWithSubcolumns() const
{
return getWithSubcolumns(getAll());
}
NamesAndTypesList ColumnsDescription::getAllPhysicalWithSubcolumns() const
{
return getWithSubcolumns(getAllPhysical());
}
bool ColumnsDescription::hasDefaults() const
{
@ -483,13 +534,33 @@ ColumnsDescription ColumnsDescription::parse(const String & str)
ColumnDescription column;
column.readText(buf);
buf.ignore(1); /// ignore new line
result.add(std::move(column));
result.add(column);
}
assertEOF(buf);
return result;
}
void ColumnsDescription::addSubcolumns(const String & name_in_storage, const DataTypePtr & type_in_storage)
{
for (const auto & subcolumn_name : type_in_storage->getSubcolumnNames())
{
auto subcolumn = NameAndTypePair(name_in_storage, subcolumn_name,
type_in_storage, type_in_storage->getSubcolumnType(subcolumn_name));
if (has(subcolumn.name))
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Cannot add subcolumn {}: column with this name already exists", subcolumn.name);
subcolumns[subcolumn.name] = subcolumn;
}
}
void ColumnsDescription::removeSubcolumns(const String & name_in_storage, const DataTypePtr & type_in_storage)
{
for (const auto & subcolumn_name : type_in_storage->getSubcolumnNames())
subcolumns.erase(name_in_storage + "." + subcolumn_name);
}
Block validateColumnsDefaultsAndGetSampleBlock(ASTPtr default_expr_list, const NamesAndTypesList & all_columns, const Context & context)
{

View File

@ -77,6 +77,8 @@ public:
NamesAndTypesList getAliases() const;
NamesAndTypesList getAllPhysical() const; /// ordinary + materialized.
NamesAndTypesList getAll() const; /// ordinary + materialized + aliases
NamesAndTypesList getAllWithSubcolumns() const;
NamesAndTypesList getAllPhysicalWithSubcolumns() const;
using ColumnTTLs = std::unordered_map<String, ASTPtr>;
ColumnTTLs getColumnTTLs() const;
@ -105,7 +107,9 @@ public:
Names getNamesOfPhysical() const;
bool hasPhysical(const String & column_name) const;
bool hasPhysicalOrSubcolumn(const String & column_name) const;
NameAndTypePair getPhysical(const String & column_name) const;
NameAndTypePair getPhysicalOrSubcolumn(const String & column_name) const;
ColumnDefaults getDefaults() const; /// TODO: remove
bool hasDefault(const String & column_name) const;
@ -141,7 +145,12 @@ public:
private:
Container columns;
using SubcolumnsContainer = std::unordered_map<String, NameAndTypePair>;
SubcolumnsContainer subcolumns;
void modifyColumnOrder(const String & column_name, const String & after_column, bool first);
void addSubcolumns(const String & name_in_storage, const DataTypePtr & type_in_storage);
void removeSubcolumns(const String & name_in_storage, const DataTypePtr & type_in_storage);
};
/// Validate default expressions and corresponding types compatibility, i.e.

View File

@ -128,6 +128,9 @@ public:
/// Example is StorageSystemNumbers.
virtual bool hasEvenlyDistributedRead() const { return false; }
/// Returns true if the storage supports reading of subcolumns of complex types.
virtual bool supportsSubcolumns() const { return false; }
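/// For example, a storage that returns true here may serve reads of `arr.size0` for an Array column
/// or `col.null` for a Nullable column without reading the whole column (these subcolumn names are
/// assumptions based on IDataType::getSubcolumnNames()).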
/// Optional size information of each physical column.
/// Currently it's only used by the MergeTree family for query optimizations.

View File

@ -17,6 +17,7 @@
#include <common/logger_useful.h>
#include <Compression/getCompressionCodecForFile.h>
#include <Parsers/queryToString.h>
#include <DataTypes/NestedUtils.h>
namespace CurrentMetrics
@ -321,7 +322,12 @@ void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns)
column_name_to_position.reserve(new_columns.size());
size_t pos = 0;
for (const auto & column : columns)
column_name_to_position.emplace(column.name, pos++);
{
column_name_to_position.emplace(column.name, pos);
for (const auto & subcolumn : column.type->getSubcolumnNames())
column_name_to_position.emplace(Nested::concatenateName(column.name, subcolumn), pos);
++pos;
}
}
void IMergeTreeDataPart::removeIfNeeded()
@ -454,7 +460,7 @@ String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageM
if (alter_conversions.isColumnRenamed(column.name))
column_name = alter_conversions.getColumnOldName(column.name);
if (!hasColumnFiles(column_name, *column_type))
if (!hasColumnFiles(column))
continue;
const auto size = getColumnSize(column_name, *column_type).data_compressed;
@ -640,7 +646,7 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const
{
if (path_to_data_file.empty())
{
String candidate_path = getFullRelativePath() + IDataType::getFileNameForStream(part_column.name, substream_path) + ".bin";
String candidate_path = getFullRelativePath() + IDataType::getFileNameForStream(part_column, substream_path) + ".bin";
/// We can have existing, but empty .bin files. Example: LowCardinality(Nullable(...)) columns and column_name.dict.null.bin file.
if (volume->getDisk()->exists(candidate_path) && volume->getDisk()->getFileSize(candidate_path) != 0)

View File

@ -330,7 +330,7 @@ public:
/// NOTE: Doesn't take column renames into account, if some column renames
/// take place, you must take original name of column for this part from
/// storage and pass it to this method.
virtual bool hasColumnFiles(const String & /* column */, const IDataType & /* type */) const { return false; }
virtual bool hasColumnFiles(const NameAndTypePair & /* column */) const { return false; }
/// Returns true if this part shall participate in merges according to
/// settings of given storage policy.

View File

@ -42,7 +42,14 @@ IMergeTreeReader::IMergeTreeReader(
, all_mark_ranges(all_mark_ranges_)
, alter_conversions(storage.getAlterConversionsForPart(data_part))
{
for (const NameAndTypePair & column_from_part : data_part->getColumns())
auto part_columns = data_part->getColumns();
if (settings.convert_nested_to_subcolumns)
{
columns = Nested::convertToSubcolumns(columns);
part_columns = Nested::collect(part_columns);
}
for (const NameAndTypePair & column_from_part : part_columns)
columns_from_part[column_from_part.name] = column_from_part.type;
}
@ -74,7 +81,6 @@ static bool arrayHasNoElementsRead(const IColumn & column)
return last_offset != 0;
}
void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows)
{
try
@ -197,19 +203,33 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
NameAndTypePair IMergeTreeReader::getColumnFromPart(const NameAndTypePair & required_column) const
{
if (alter_conversions.isColumnRenamed(required_column.name))
auto name_in_storage = required_column.getNameInStorage();
decltype(columns_from_part.begin()) it;
if (alter_conversions.isColumnRenamed(name_in_storage))
{
String old_name = alter_conversions.getColumnOldName(required_column.name);
auto it = columns_from_part.find(old_name);
if (it != columns_from_part.end())
return {it->first, it->second};
String old_name = alter_conversions.getColumnOldName(name_in_storage);
it = columns_from_part.find(old_name);
}
else if (auto it = columns_from_part.find(required_column.name); it != columns_from_part.end())
else
{
return {it->first, it->second};
it = columns_from_part.find(name_in_storage);
}
return required_column;
if (it == columns_from_part.end())
return required_column;
if (required_column.isSubcolumn())
{
auto subcolumn_name = required_column.getSubcolumnName();
auto subcolumn_type = it->second->tryGetSubcolumnType(subcolumn_name);
if (!subcolumn_type)
subcolumn_type = required_column.type;
return {it->first, subcolumn_name, it->second, subcolumn_type};
}
return {it->first, it->second};
}
void IMergeTreeReader::performRequiredConversions(Columns & res_columns)

View File

@ -33,7 +33,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
column.type->enumerateStreams(
[&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_path */)
{
++stream_counts[IDataType::getFileNameForStream(column.name, substream_path)];
++stream_counts[IDataType::getFileNameForStream(column, substream_path)];
},
{});
}
@ -42,9 +42,13 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
const String mrk_extension = data_part->getMarksFileExtension();
for (const auto & column_name : empty_columns)
{
auto column_with_type = columns.tryGetByName(column_name);
if (!column_with_type)
continue;
IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_path */)
{
String stream_name = IDataType::getFileNameForStream(column_name, substream_path);
String stream_name = IDataType::getFileNameForStream(*column_with_type, substream_path);
/// Delete files if they are no longer shared with another column.
if (--stream_counts[stream_name] == 0)
{
@ -52,10 +56,9 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
remove_files.emplace(stream_name + mrk_extension);
}
};
IDataType::SubstreamPath stream_path;
auto column_with_type = columns.tryGetByName(column_name);
if (column_with_type)
column_with_type->type->enumerateStreams(callback, stream_path);
column_with_type->type->enumerateStreams(callback, stream_path);
}
/// Remove files on disk and checksums

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Core/NamesAndTypes.h>
#include <Common/checkStackSize.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnConst.h>
@ -33,21 +34,30 @@ bool injectRequiredColumnsRecursively(
/// huge AST which for some reason was not validated on parsing/interpreter
/// stages.
checkStackSize();
String column_name_in_part = column_name;
if (alter_conversions.isColumnRenamed(column_name_in_part))
column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);
/// column has files and hence does not require evaluation
if (storage_columns.hasPhysical(column_name) && part->hasColumnFiles(column_name_in_part, *storage_columns.getPhysical(column_name).type))
if (storage_columns.hasPhysicalOrSubcolumn(column_name))
{
/// ensure each column is added only once
if (required_columns.count(column_name) == 0)
auto column_in_storage = storage_columns.getPhysicalOrSubcolumn(column_name);
auto column_name_in_part = column_in_storage.getNameInStorage();
if (alter_conversions.isColumnRenamed(column_name_in_part))
column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);
auto column_in_part = NameAndTypePair(
column_name_in_part, column_in_storage.getSubcolumnName(),
column_in_storage.getTypeInStorage(), column_in_storage.type);
/// column has files and hence does not require evaluation
if (part->hasColumnFiles(column_in_part))
{
columns.emplace_back(column_name);
required_columns.emplace(column_name);
injected_columns.emplace(column_name);
/// ensure each column is added only once
if (required_columns.count(column_name) == 0)
{
columns.emplace_back(column_name);
required_columns.emplace(column_name);
injected_columns.emplace(column_name);
}
return true;
}
return true;
}
/// Column doesn't have default value and don't exist in part
@ -81,8 +91,8 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada
for (size_t i = 0; i < columns.size(); ++i)
{
/// We are going to fetch only physical columns
if (!storage_columns.hasPhysical(columns[i]))
throw Exception("There is no physical column " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE);
if (!storage_columns.hasPhysicalOrSubcolumn(columns[i]))
throw Exception("There is no physical column or subcolumn " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE);
have_at_least_one_physical_column |= injectRequiredColumnsRecursively(
columns[i], storage_columns, alter_conversions,
@ -285,7 +295,7 @@ MergeTreeReadTaskColumns getReadTaskColumns(
if (check_columns)
{
const NamesAndTypesList & physical_columns = metadata_snapshot->getColumns().getAllPhysical();
const NamesAndTypesList & physical_columns = metadata_snapshot->getColumns().getAllWithSubcolumns();
result.pre_columns = physical_columns.addTypes(pre_column_names);
result.columns = physical_columns.addTypes(column_names);
}

View File

@ -357,6 +357,8 @@ public:
|| merging_params.mode == MergingParams::VersionedCollapsing;
}
bool supportsSubcolumns() const override { return true; }
NamesAndTypesList getVirtuals() const override;
bool mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, const Context &, const StorageMetadataPtr & metadata_snapshot) const override;

View File

@ -1493,7 +1493,7 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames(
column.type->enumerateStreams(
[&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
++stream_counts[IDataType::getFileNameForStream(column.name, substream_path)];
++stream_counts[IDataType::getFileNameForStream(column, substream_path)];
},
{});
}
@ -1511,7 +1511,7 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames(
{
IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String stream_name = IDataType::getFileNameForStream(command.column_name, substream_path);
String stream_name = IDataType::getFileNameForStream({command.column_name, command.data_type}, substream_path);
/// Delete files if they are no longer shared with another column.
if (--stream_counts[stream_name] == 0)
{
@ -1532,7 +1532,7 @@ NameToNameVector MergeTreeDataMergerMutator::collectFilesForRenames(
IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String stream_from = IDataType::getFileNameForStream(command.column_name, substream_path);
String stream_from = IDataType::getFileNameForStream({command.column_name, command.data_type}, substream_path);
String stream_to = boost::replace_first_copy(stream_from, escaped_name_from, escaped_name_to);
@ -1565,7 +1565,7 @@ NameSet MergeTreeDataMergerMutator::collectFilesToSkip(
{
IDataType::StreamCallback callback = [&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String stream_name = IDataType::getFileNameForStream(entry.name, substream_path);
String stream_name = IDataType::getFileNameForStream({entry.name, entry.type}, substream_path);
files_to_skip.insert(stream_name + ".bin");
files_to_skip.insert(stream_name + mrk_extension);
};

View File

@ -1,4 +1,5 @@
#include "MergeTreeDataPartCompact.h"
#include <DataTypes/NestedUtils.h>
#include <Storages/MergeTree/MergeTreeReaderCompact.h>
#include <Storages/MergeTree/MergeTreeDataPartWriterCompact.h>
#include <Poco/File.h>
@ -121,9 +122,9 @@ void MergeTreeDataPartCompact::loadIndexGranularity()
index_granularity.setInitialized();
}
bool MergeTreeDataPartCompact::hasColumnFiles(const String & column_name, const IDataType &) const
bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) const
{
if (!getColumnPosition(column_name))
if (!getColumnPosition(column.name))
return false;
auto bin_checksum = checksums.files.find(DATA_FILE_NAME_WITH_EXTENSION);

View File

@ -55,7 +55,7 @@ public:
bool isStoredOnDisk() const override { return true; }
bool hasColumnFiles(const String & column_name, const IDataType & type) const override;
bool hasColumnFiles(const NameAndTypePair & column) const override;
String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return DATA_FILE_NAME; }

View File

@ -3,6 +3,7 @@
#include <Storages/MergeTree/MergedBlockOutputStream.h>
#include <Storages/MergeTree/MergeTreeDataPartWriterInMemory.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <DataTypes/NestedUtils.h>
#include <Interpreters/Context.h>
#include <Poco/File.h>
#include <Poco/Logger.h>

View File

@ -32,6 +32,7 @@ public:
const MergeTreeReaderSettings & reader_settings_,
const ValueSizeMap & avg_value_size_hints,
const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override;
MergeTreeWriterPtr getWriter(
const NamesAndTypesList & columns_list,
const StorageMetadataPtr & metadata_snapshot,
@ -41,7 +42,7 @@ public:
const MergeTreeIndexGranularity & computed_index_granularity) const override;
bool isStoredOnDisk() const override { return false; }
bool hasColumnFiles(const String & column_name, const IDataType & /* type */) const override { return !!getColumnPosition(column_name); }
bool hasColumnFiles(const NameAndTypePair & column) const override { return !!getColumnPosition(column.name); }
String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; }
void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const override;
void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const override;

View File

@ -3,6 +3,8 @@
#include <Storages/MergeTree/MergeTreeReaderWide.h>
#include <Storages/MergeTree/MergeTreeDataPartWriterWide.h>
#include <Storages/MergeTree/IMergeTreeDataPartWriter.h>
#include <DataTypes/NestedUtils.h>
#include <Core/NamesAndTypes.h>
namespace DB
@ -46,10 +48,13 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader(
const ValueSizeMap & avg_value_size_hints,
const ReadBufferFromFileBase::ProfileCallback & profile_callback) const
{
auto new_settings = reader_settings;
new_settings.convert_nested_to_subcolumns = true;
auto ptr = std::static_pointer_cast<const MergeTreeDataPartWide>(shared_from_this());
return std::make_unique<MergeTreeReaderWide>(
ptr, columns_to_read, metadata_snapshot, uncompressed_cache,
mark_cache, mark_ranges, reader_settings,
mark_cache, mark_ranges, new_settings,
avg_value_size_hints, profile_callback);
}
@ -71,15 +76,15 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter(
/// Takes into account the fact that several columns can e.g. share their .size substreams.
/// When calculating totals these should be counted only once.
ColumnSize MergeTreeDataPartWide::getColumnSizeImpl(
const String & column_name, const IDataType & type, std::unordered_set<String> * processed_substreams) const
const NameAndTypePair & column, std::unordered_set<String> * processed_substreams) const
{
ColumnSize size;
if (checksums.empty())
return size;
type.enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String file_name = IDataType::getFileNameForStream(column_name, substream_path);
String file_name = IDataType::getFileNameForStream(column, substream_path);
if (processed_substreams && !processed_substreams->insert(file_name).second)
return;
@ -157,7 +162,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
IDataType::SubstreamPath stream_path;
name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);
String file_name = IDataType::getFileNameForStream(name_type, substream_path);
String mrk_file_name = file_name + index_granularity_info.marks_file_extension;
String bin_file_name = file_name + ".bin";
if (!checksums.files.count(mrk_file_name))
@ -179,7 +184,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
{
name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
auto file_path = path + IDataType::getFileNameForStream(name_type.name, substream_path) + index_granularity_info.marks_file_extension;
auto file_path = path + IDataType::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension;
/// Missing file is Ok for case when new column was added.
if (volume->getDisk()->exists(file_path))
@ -201,13 +206,13 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
}
}
bool MergeTreeDataPartWide::hasColumnFiles(const String & column_name, const IDataType & type) const
bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const
{
bool res = true;
type.enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String file_name = IDataType::getFileNameForStream(column_name, substream_path);
String file_name = IDataType::getFileNameForStream(column, substream_path);
auto bin_checksum = checksums.files.find(file_name + ".bin");
auto mrk_checksum = checksums.files.find(file_name + index_granularity_info.marks_file_extension);
@ -225,7 +230,7 @@ String MergeTreeDataPartWide::getFileNameForColumn(const NameAndTypePair & colum
column.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
if (filename.empty())
filename = IDataType::getFileNameForStream(column.name, substream_path);
filename = IDataType::getFileNameForStream(column, substream_path);
});
return filename;
}
@ -235,7 +240,7 @@ void MergeTreeDataPartWide::calculateEachColumnSizes(ColumnSizeByName & each_col
std::unordered_set<String> processed_substreams;
for (const NameAndTypePair & column : columns)
{
ColumnSize size = getColumnSizeImpl(column.name, *column.type, &processed_substreams);
ColumnSize size = getColumnSizeImpl(column, &processed_substreams);
each_columns_size[column.name] = size;
total_size.add(size);
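
Editorial aside (not part of the diff): the changes in this file consistently replace separate name/type arguments with a single NameAndTypePair, so helpers like IDataType::getFileNameForStream can derive a stream file name from the whole column descriptor, including subcolumns. Below is a minimal, self-contained C++ sketch of that idea; the struct, its fields and the helper are simplified stand-ins, not the real ClickHouse declarations.

    #include <iostream>
    #include <optional>
    #include <string>

    // Simplified stand-in for the real NameAndTypePair: just enough state to tell
    // a plain column from a subcolumn of a larger storage column.
    struct NameAndTypePair
    {
        std::string name;                      // requested column, e.g. "value" or "n.a"
        std::string storage_name;              // column as stored on disk, e.g. "n"
        std::optional<std::string> subcolumn;  // set when this entry is a subcolumn

        bool isSubcolumn() const { return subcolumn.has_value(); }
        const std::string & getNameInStorage() const { return storage_name; }
        const std::string & getSubcolumnName() const { return *subcolumn; }
    };

    // Stand-in for IDataType::getFileNameForStream: the file name is derived from
    // the storage column plus the subcolumn suffix, so passing the whole descriptor
    // (instead of a bare name) is enough to resolve it.
    std::string getFileNameForStream(const NameAndTypePair & column)
    {
        if (column.isSubcolumn())
            return column.getNameInStorage() + "." + column.getSubcolumnName() + ".bin";
        return column.name + ".bin";
    }

    int main()
    {
        NameAndTypePair plain{"value", "value", std::nullopt};
        NameAndTypePair sub{"n.a", "n", "a"};

        std::cout << getFileNameForStream(plain) << '\n';  // value.bin
        std::cout << getFileNameForStream(sub) << '\n';    // n.a.bin
    }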

View File

@ -54,7 +54,7 @@ public:
~MergeTreeDataPartWide() override;
bool hasColumnFiles(const String & column, const IDataType & type) const override;
bool hasColumnFiles(const NameAndTypePair & column) const override;
private:
void checkConsistency(bool require_part_metadata) const override;
@ -62,7 +62,7 @@ private:
/// Loads marks index granularity into memory
void loadIndexGranularity() override;
ColumnSize getColumnSizeImpl(const String & name, const IDataType & type, std::unordered_set<String> * processed_substreams) const;
ColumnSize getColumnSizeImpl(const NameAndTypePair & column, std::unordered_set<String> * processed_substreams) const;
void calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const override;
};

View File

@ -34,14 +34,14 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact(
{
const auto & storage_columns = metadata_snapshot->getColumns();
for (const auto & column : columns_list)
addStreams(column.name, *column.type, storage_columns.getCodecDescOrDefault(column.name, default_codec));
addStreams(column, storage_columns.getCodecDescOrDefault(column.name, default_codec));
}
void MergeTreeDataPartWriterCompact::addStreams(const String & name, const IDataType & type, const ASTPtr & effective_codec_desc)
void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc)
{
IDataType::StreamCallback callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & substream_type)
{
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(column, substream_path);
/// Shared offsets for Nested type.
if (compressed_streams.count(stream_name))
@ -64,7 +64,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const String & name, const IData
};
IDataType::SubstreamPath stream_path;
type.enumerateStreams(callback, stream_path);
column.type->enumerateStreams(callback, stream_path);
}
namespace
@ -183,7 +183,7 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G
CompressedStreamPtr prev_stream;
auto stream_getter = [&, this](const IDataType::SubstreamPath & substream_path) -> WriteBuffer *
{
String stream_name = IDataType::getFileNameForStream(name_and_type->name, substream_path);
String stream_name = IDataType::getFileNameForStream(*name_and_type, substream_path);
auto & result_stream = compressed_streams[stream_name];
/// Write one compressed block per column in granule for more optimal reading.
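
Aside (not part of the diff): addStreams enumerates a column's substreams and keys the created streams by file name, so a substream shared by several columns, for example the offsets of old-style Nested siblings, is registered only once. A small self-contained sketch of that de-duplication follows; the stream names are made up and a plain map stands in for the real stream objects.

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
        // Substream file names produced for two old-style Nested siblings:
        // "n.a" and "n.b" both share the offsets substream "n.size0".
        std::vector<std::pair<std::string, std::vector<std::string>>> columns = {
            {"n.a", {"n.size0", "n.a"}},
            {"n.b", {"n.size0", "n.b"}},
        };

        std::map<std::string, int> streams;  // stream file name -> stream id (stand-in for the stream object)
        int next_id = 0;

        for (const auto & entry : columns)
            for (const auto & stream_name : entry.second)
            {
                if (streams.count(stream_name))
                    continue;  // shared substream already registered, skip it
                streams.emplace(stream_name, next_id++);
            }

        for (const auto & [name, id] : streams)
            std::cout << name << " -> stream #" << id << '\n';
        // Three streams in total: the shared "n.size0" offsets are created only once.
    }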

View File

@ -37,7 +37,7 @@ private:
void addToChecksums(MergeTreeDataPartChecksums & checksums);
void addStreams(const String & name, const IDataType & type, const ASTPtr & effective_codec_desc);
void addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc);
Block header;

View File

@ -80,17 +80,17 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide(
{
const auto & columns = metadata_snapshot->getColumns();
for (const auto & it : columns_list)
addStreams(it.name, *it.type, columns.getCodecDescOrDefault(it.name, default_codec));
addStreams(it, columns.getCodecDescOrDefault(it.name, default_codec));
}
void MergeTreeDataPartWriterWide::addStreams(
const String & name,
const IDataType & type,
const NameAndTypePair & column,
const ASTPtr & effective_codec_desc)
{
IDataType::StreamCallback callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & substream_type)
{
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(column, substream_path);
/// Shared offsets for Nested type.
if (column_streams.count(stream_name))
return;
@ -112,18 +112,18 @@ void MergeTreeDataPartWriterWide::addStreams(
};
IDataType::SubstreamPath stream_path;
type.enumerateStreams(callback, stream_path);
column.type->enumerateStreams(callback, stream_path);
}
IDataType::OutputStreamGetter MergeTreeDataPartWriterWide::createStreamGetter(
const String & name, WrittenOffsetColumns & offset_columns) const
const NameAndTypePair & column, WrittenOffsetColumns & offset_columns) const
{
return [&, this] (const IDataType::SubstreamPath & substream_path) -> WriteBuffer *
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(column, substream_path);
/// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name))
@ -210,23 +210,23 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm
if (primary_key_block.has(it->name))
{
const auto & primary_column = *primary_key_block.getByName(it->name).column;
writeColumn(column.name, *column.type, primary_column, offset_columns, granules_to_write);
writeColumn(*it, primary_column, offset_columns, granules_to_write);
}
else if (skip_indexes_block.has(it->name))
{
const auto & index_column = *skip_indexes_block.getByName(it->name).column;
writeColumn(column.name, *column.type, index_column, offset_columns, granules_to_write);
writeColumn(*it, index_column, offset_columns, granules_to_write);
}
else
{
/// We rearrange the columns that are not included in the primary key here; Then the result is released - to save RAM.
ColumnPtr permuted_column = column.column->permute(*permutation, 0);
writeColumn(column.name, *column.type, *permuted_column, offset_columns, granules_to_write);
writeColumn(*it, *permuted_column, offset_columns, granules_to_write);
}
}
else
{
writeColumn(column.name, *column.type, *column.column, offset_columns, granules_to_write);
writeColumn(*it, *column.column, offset_columns, granules_to_write);
}
}
@ -239,13 +239,12 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm
}
void MergeTreeDataPartWriterWide::writeSingleMark(
const String & name,
const IDataType & type,
const NameAndTypePair & column,
WrittenOffsetColumns & offset_columns,
size_t number_of_rows,
DB::IDataType::SubstreamPath & path)
{
StreamsWithMarks marks = getCurrentMarksForColumn(name, type, offset_columns, path);
StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns, path);
for (const auto & mark : marks)
flushMarkToFile(mark, number_of_rows);
}
@ -260,17 +259,16 @@ void MergeTreeDataPartWriterWide::flushMarkToFile(const StreamNameAndMark & stre
}
StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn(
const String & name,
const IDataType & type,
const NameAndTypePair & column,
WrittenOffsetColumns & offset_columns,
DB::IDataType::SubstreamPath & path)
{
StreamsWithMarks result;
type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
column.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(column, substream_path);
/// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name))
@ -294,22 +292,21 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn(
}
void MergeTreeDataPartWriterWide::writeSingleGranule(
const String & name,
const IDataType & type,
const NameAndTypePair & name_and_type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
IDataType::SerializeBinaryBulkSettings & serialize_settings,
const Granule & granule)
{
type.serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state);
name_and_type.type->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state);
/// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one.
type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
name_and_type.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path);
/// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name))
@ -321,27 +318,27 @@ void MergeTreeDataPartWriterWide::writeSingleGranule(
/// Column must not be empty. (column.size() != 0)
void MergeTreeDataPartWriterWide::writeColumn(
const String & name,
const IDataType & type,
const NameAndTypePair & name_and_type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
const Granules & granules)
{
if (granules.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty granules for column {}, current mark {}", backQuoteIfNeed(name), getCurrentMark());
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty granules for column {}, current mark {}", backQuoteIfNeed(name_and_type.name), getCurrentMark());
const auto & [name, type] = name_and_type;
auto [it, inserted] = serialization_states.emplace(name, nullptr);
if (inserted)
{
IDataType::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.getter = createStreamGetter(name, offset_columns);
type.serializeBinaryBulkStatePrefix(serialize_settings, it->second);
serialize_settings.getter = createStreamGetter(name_and_type, offset_columns);
type->serializeBinaryBulkStatePrefix(serialize_settings, it->second);
}
const auto & global_settings = storage.global_context.getSettingsRef();
IDataType::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.getter = createStreamGetter(name, offset_columns);
serialize_settings.getter = createStreamGetter(name_and_type, offset_columns);
serialize_settings.low_cardinality_max_dictionary_size = global_settings.low_cardinality_max_dictionary_size;
serialize_settings.low_cardinality_use_single_dictionary_for_part = global_settings.low_cardinality_use_single_dictionary_for_part != 0;
@ -353,12 +350,11 @@ void MergeTreeDataPartWriterWide::writeColumn(
{
if (last_non_written_marks.count(name))
throw Exception(ErrorCodes::LOGICAL_ERROR, "We have to add new mark for column, but already have non written mark. Current mark {}, total marks {}, offset {}", getCurrentMark(), index_granularity.getMarksCount(), rows_written_in_last_mark);
last_non_written_marks[name] = getCurrentMarksForColumn(name, type, offset_columns, serialize_settings.path);
last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns, serialize_settings.path);
}
writeSingleGranule(
name,
type,
name_and_type,
column,
offset_columns,
it->second,
@ -378,12 +374,12 @@ void MergeTreeDataPartWriterWide::writeColumn(
}
}
type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
name_and_type.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
if (is_offsets)
{
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path);
offset_columns.insert(stream_name);
}
}, serialize_settings.path);
@ -526,12 +522,12 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch
{
if (!serialization_states.empty())
{
serialize_settings.getter = createStreamGetter(it->name, written_offset_columns ? *written_offset_columns : offset_columns);
serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? *written_offset_columns : offset_columns);
it->type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]);
}
if (write_final_mark)
writeFinalMark(it->name, it->type, offset_columns, serialize_settings.path);
writeFinalMark(*it, offset_columns, serialize_settings.path);
}
}
for (auto & stream : column_streams)
@ -567,19 +563,18 @@ void MergeTreeDataPartWriterWide::finish(IMergeTreeDataPart::Checksums & checksu
}
void MergeTreeDataPartWriterWide::writeFinalMark(
const std::string & column_name,
const DataTypePtr column_type,
const NameAndTypePair & column,
WrittenOffsetColumns & offset_columns,
DB::IDataType::SubstreamPath & path)
{
writeSingleMark(column_name, *column_type, offset_columns, 0, path);
writeSingleMark(column, offset_columns, 0, path);
/// Memoize information about offsets
column_type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
column.type->enumerateStreams([&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
if (is_offsets)
{
String stream_name = IDataType::getFileNameForStream(column_name, substream_path);
String stream_name = IDataType::getFileNameForStream(column, substream_path);
offset_columns.insert(stream_name);
}
}, path);

View File

@ -40,16 +40,14 @@ private:
/// Return how many marks were written and
/// how many rows were written for last mark
void writeColumn(
const String & name,
const IDataType & type,
const NameAndTypePair & name_and_type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
const Granules & granules);
/// Write single granule of one column.
void writeSingleGranule(
const String & name,
const IDataType & type,
const NameAndTypePair & name_and_type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
@ -58,8 +56,7 @@ private:
/// Take offsets from column and return as MarkInCompressed file with stream name
StreamsWithMarks getCurrentMarksForColumn(
const String & name,
const IDataType & type,
const NameAndTypePair & column,
WrittenOffsetColumns & offset_columns,
DB::IDataType::SubstreamPath & path);
@ -70,21 +67,18 @@ private:
/// Write mark for column taking offsets from column stream
void writeSingleMark(
const String & name,
const IDataType & type,
const NameAndTypePair & column,
WrittenOffsetColumns & offset_columns,
size_t number_of_rows,
DB::IDataType::SubstreamPath & path);
void writeFinalMark(
const std::string & column_name,
const DataTypePtr column_type,
const NameAndTypePair & column,
WrittenOffsetColumns & offset_columns,
DB::IDataType::SubstreamPath & path);
void addStreams(
const String & name,
const IDataType & type,
const NameAndTypePair & column,
const ASTPtr & effective_codec_desc);
/// Method for self check (used in debug-build only). Checks that written
@ -106,7 +100,7 @@ private:
/// Also useful to have exact amount of rows in last (non-final) mark.
void adjustLastMarkIfNeedAndFlushToDisk(size_t new_rows_in_last_mark);
IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenOffsetColumns & offset_columns) const;
IDataType::OutputStreamGetter createStreamGetter(const NameAndTypePair & column, WrittenOffsetColumns & offset_columns) const;
using SerializationState = IDataType::SerializeBinaryBulkStatePtr;
using SerializationStates = std::unordered_map<String, SerializationState>;

View File

@ -14,6 +14,8 @@ struct MergeTreeReaderSettings
/// If save_marks_in_cache is false, then, if marks are not in cache,
/// we will load them but won't save in the cache, to avoid evicting other data.
bool save_marks_in_cache = false;
/// Convert old-style nested (single arrays with same prefix, `n.a`, `n.b`...) to subcolumns of data type Nested.
bool convert_nested_to_subcolumns = false;
};
struct MergeTreeWriterSettings
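
For context (not part of the diff): the new convert_nested_to_subcolumns flag asks the reader to expose old-style flattened nested columns, i.e. sibling arrays such as `n.a` and `n.b` that only share a prefix, as subcolumns of a single Nested column. A toy sketch of the grouping this implies; the column names are invented and the code only illustrates the prefix grouping, not the real conversion.

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main()
    {
        // Column names as stored in an old-style part: "n.a" and "n.b" are
        // separate arrays that only share the "n" prefix.
        std::vector<std::string> part_columns = {"id", "n.a", "n.b", "other"};

        std::map<std::string, std::vector<std::string>> nested;  // prefix -> subcolumn names
        for (const auto & name : part_columns)
        {
            auto dot = name.find('.');
            if (dot != std::string::npos)
                nested[name.substr(0, dot)].push_back(name.substr(dot + 1));
        }

        for (const auto & [prefix, subcolumns] : nested)
        {
            std::cout << prefix << " -> Nested(";
            for (size_t i = 0; i < subcolumns.size(); ++i)
                std::cout << (i ? ", " : "") << subcolumns[i];
            std::cout << ")\n";  // prints: n -> Nested(a, b)
        }
    }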

View File

@ -93,7 +93,7 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr)
{
const auto & column = index_sample_block.getByPosition(i);
const auto & type = column.type;
auto new_column = type->createColumn();
ColumnPtr new_column = type->createColumn();
IDataType::DeserializeBinaryBulkSettings settings;
settings.getter = [&](IDataType::SubstreamPath) -> ReadBuffer * { return &istr; };
@ -101,9 +101,9 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr)
IDataType::DeserializeBinaryBulkStatePtr state;
type->deserializeBinaryBulkStatePrefix(settings, state);
type->deserializeBinaryBulkWithMultipleStreams(*new_column, rows_to_read, settings, state);
type->deserializeBinaryBulkWithMultipleStreams(new_column, rows_to_read, settings, state);
block.insert(ColumnWithTypeAndName(new_column->getPtr(), type, column.name));
block.insert(ColumnWithTypeAndName(new_column, type, column.name));
}
}

View File

@ -53,14 +53,14 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
auto name_and_type = columns.begin();
for (size_t i = 0; i < columns_num; ++i, ++name_and_type)
{
const auto & [name, type] = getColumnFromPart(*name_and_type);
auto position = data_part->getColumnPosition(name);
auto column_from_part = getColumnFromPart(*name_and_type);
if (!position && typeid_cast<const DataTypeArray *>(type.get()))
auto position = data_part->getColumnPosition(column_from_part.name);
if (!position && typeid_cast<const DataTypeArray *>(column_from_part.type.get()))
{
/// If array of Nested column is missing in part,
/// we have to read its offsets if they exist.
position = findColumnForOffsets(name);
position = findColumnForOffsets(column_from_part.name);
read_only_offsets[i] = (position != std::nullopt);
}
@ -133,10 +133,8 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading,
if (!column_positions[i])
continue;
bool append = res_columns[i] != nullptr;
if (!append)
if (res_columns[i] == nullptr)
res_columns[i] = getColumnFromPart(*column_it).type->createColumn();
mutable_columns[i] = res_columns[i]->assumeMutable();
}
while (read_rows < max_rows_to_read)
@ -146,20 +144,18 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading,
auto name_and_type = columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
{
auto column_from_part = getColumnFromPart(*name_and_type);
if (!res_columns[pos])
continue;
auto [name, type] = getColumnFromPart(*name_and_type);
auto & column = mutable_columns[pos];
try
{
auto & column = res_columns[pos];
size_t column_size_before_reading = column->size();
readData(name, *column, *type, from_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]);
readData(column_from_part, column, from_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]);
size_t read_rows_in_column = column->size() - column_size_before_reading;
if (read_rows_in_column < rows_to_read)
throw Exception("Cannot read all data in MergeTreeReaderCompact. Rows read: " + toString(read_rows_in_column) +
". Rows expected: " + toString(rows_to_read) + ".", ErrorCodes::CANNOT_READ_ALL_DATA);
@ -170,7 +166,7 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading,
storage.reportBrokenPart(data_part->name);
/// Better diagnostics.
e.addMessage("(while reading column " + name + ")");
e.addMessage("(while reading column " + column_from_part.name + ")");
throw;
}
catch (...)
@ -184,24 +180,17 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading,
read_rows += rows_to_read;
}
for (size_t i = 0; i < num_columns; ++i)
{
auto & column = mutable_columns[i];
if (column && !column->empty())
res_columns[i] = std::move(column);
else
res_columns[i] = nullptr;
}
next_mark = from_mark;
return read_rows;
}
void MergeTreeReaderCompact::readData(
const String & name, IColumn & column, const IDataType & type,
const NameAndTypePair & name_and_type, ColumnPtr & column,
size_t from_mark, size_t column_position, size_t rows_to_read, bool only_offsets)
{
const auto & [name, type] = name_and_type;
if (!isContinuousReading(from_mark, column_position))
seekToMark(from_mark, column_position);
@ -213,14 +202,25 @@ void MergeTreeReaderCompact::readData(
return data_buffer;
};
IDataType::DeserializeBinaryBulkStatePtr state;
IDataType::DeserializeBinaryBulkSettings deserialize_settings;
deserialize_settings.getter = buffer_getter;
deserialize_settings.avg_value_size_hint = avg_value_size_hints[name];
deserialize_settings.position_independent_encoding = true;
IDataType::DeserializeBinaryBulkStatePtr state;
type.deserializeBinaryBulkStatePrefix(deserialize_settings, state);
type.deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state);
if (name_and_type.isSubcolumn())
{
auto type_in_storage = name_and_type.getTypeInStorage();
ColumnPtr temp_column = type_in_storage->createColumn();
type_in_storage->deserializeBinaryBulkStatePrefix(deserialize_settings, state);
type_in_storage->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, state);
column = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), *temp_column);
}
else
{
type->deserializeBinaryBulkStatePrefix(deserialize_settings, state);
type->deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state);
}
/// The buffer is left in inconsistent state after reading single offsets
if (only_offsets)
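
Note on the branch above (not part of the diff): when the requested entry is a subcolumn, the reader now deserializes the whole storage column and then projects out the requested part via getSubcolumn. A self-contained toy sketch of that read-then-project pattern; the container types and function names are simplified stand-ins, not the IDataType/IColumn interfaces.

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified stand-ins: a "storage column" is a map from subcolumn name to data.
    using Subcolumn = std::vector<int>;
    using StorageColumn = std::map<std::string, Subcolumn>;

    // Pretend deserialization of the whole storage column from a part.
    StorageColumn readStorageColumn()
    {
        return {{"a", {1, 2, 3}}, {"b", {4, 5, 6}}};
    }

    // Project one subcolumn out of the already-read storage column.
    Subcolumn getSubcolumn(const StorageColumn & storage_column, const std::string & subcolumn_name)
    {
        return storage_column.at(subcolumn_name);
    }

    int main()
    {
        // The caller asked for subcolumn "t.a": read "t" as a whole, then take "a".
        StorageColumn whole = readStorageColumn();
        Subcolumn result = getSubcolumn(whole, "a");

        for (int v : result)
            std::cout << v << ' ';  // 1 2 3
        std::cout << '\n';
    }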

View File

@ -56,8 +56,8 @@ private:
void seekToMark(size_t row_index, size_t column_index);
void readData(const String & name, IColumn & column, const IDataType & type,
size_t from_mark, size_t column_position, size_t rows_to_read, bool only_offsets = false);
void readData(const NameAndTypePair & name_and_type, ColumnPtr & column, size_t from_mark,
size_t column_position, size_t rows_to_read, bool only_offsets);
/// Returns maximal value of granule size in compressed file from @mark_ranges.
/// This value is used as size of read buffer.

View File

@ -12,6 +12,7 @@ namespace ErrorCodes
{
extern const int CANNOT_READ_ALL_DATA;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
@ -38,6 +39,19 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory(
}
}
static ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & name_and_type)
{
auto storage_name = name_and_type.getNameInStorage();
if (!block.has(storage_name))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found column '{}' in block", storage_name);
const auto & column = block.getByName(storage_name).column;
if (name_and_type.isSubcolumn())
return name_and_type.getTypeInStorage()->getSubcolumn(name_and_type.getSubcolumnName(), *column);
return column;
}
size_t MergeTreeReaderInMemory::readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
{
if (!continue_reading)
@ -60,17 +74,17 @@ size_t MergeTreeReaderInMemory::readRows(size_t from_mark, bool continue_reading
auto column_it = columns.begin();
for (size_t i = 0; i < num_columns; ++i, ++column_it)
{
auto [name, type] = getColumnFromPart(*column_it);
auto name_type = getColumnFromPart(*column_it);
/// Copy offsets, if array of Nested column is missing in part.
auto offsets_it = positions_for_offsets.find(name);
if (offsets_it != positions_for_offsets.end())
auto offsets_it = positions_for_offsets.find(name_type.name);
if (offsets_it != positions_for_offsets.end() && !name_type.isSubcolumn())
{
const auto & source_offsets = assert_cast<const ColumnArray &>(
*part_in_memory->block.getByPosition(offsets_it->second).column).getOffsets();
if (res_columns[i] == nullptr)
res_columns[i] = type->createColumn();
res_columns[i] = name_type.type->createColumn();
auto mutable_column = res_columns[i]->assumeMutable();
auto & res_offstes = assert_cast<ColumnArray &>(*mutable_column).getOffsets();
@ -80,9 +94,9 @@ size_t MergeTreeReaderInMemory::readRows(size_t from_mark, bool continue_reading
res_columns[i] = std::move(mutable_column);
}
else if (part_in_memory->block.has(name))
else if (part_in_memory->hasColumnFiles(name_type))
{
const auto & block_column = part_in_memory->block.getByName(name).column;
auto block_column = getColumnFromBlock(part_in_memory->block, name_type);
if (rows_to_read == part_rows)
{
res_columns[i] = block_column;
@ -90,7 +104,7 @@ size_t MergeTreeReaderInMemory::readRows(size_t from_mark, bool continue_reading
else
{
if (res_columns[i] == nullptr)
res_columns[i] = type->createColumn();
res_columns[i] = name_type.type->createColumn();
auto mutable_column = res_columns[i]->assumeMutable();
mutable_column->insertRangeFrom(*block_column, total_rows_read, rows_to_read);

View File

@ -9,7 +9,6 @@
#include <Common/escapeForFileName.h>
#include <Common/typeid_cast.h>
namespace DB
{
@ -50,7 +49,7 @@ MergeTreeReaderWide::MergeTreeReaderWide(
for (const NameAndTypePair & column : columns)
{
auto column_from_part = getColumnFromPart(column);
addStreams(column_from_part.name, *column_from_part.type, profile_callback_, clock_type_);
addStreams(column_from_part, profile_callback_, clock_type_);
}
}
catch (...)
@ -73,48 +72,26 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
/// If append is true, then the value will be equal to nullptr and will be used only to
/// check that the offsets column has been already read.
OffsetColumns offset_columns;
std::unordered_map<String, IDataType::SubstreamsCache> caches;
auto name_and_type = columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
{
auto [name, type] = getColumnFromPart(*name_and_type);
auto column_from_part = getColumnFromPart(*name_and_type);
const auto & [name, type] = column_from_part;
/// The column is already present in the block so we will append the values to the end.
bool append = res_columns[pos] != nullptr;
if (!append)
res_columns[pos] = type->createColumn();
/// To keep offsets shared. TODO Very dangerous. Get rid of this.
MutableColumnPtr column = res_columns[pos]->assumeMutable();
bool read_offsets = true;
/// For nested data structures collect pointers to offset columns.
if (const auto * type_arr = typeid_cast<const DataTypeArray *>(type.get()))
{
String table_name = Nested::extractTableName(name);
auto it_inserted = offset_columns.emplace(table_name, nullptr);
/// offsets have already been read on the previous iteration and we don't need to read it again
if (!it_inserted.second)
read_offsets = false;
/// need to create new offsets
if (it_inserted.second && !append)
it_inserted.first->second = ColumnArray::ColumnOffsets::create();
/// share offsets in all elements of nested structure
if (!append)
column = ColumnArray::create(type_arr->getNestedType()->createColumn(),
it_inserted.first->second)->assumeMutable();
}
auto & column = res_columns[pos];
try
{
size_t column_size_before_reading = column->size();
auto & cache = caches[column_from_part.getNameInStorage()];
readData(name, *type, *column, from_mark, continue_reading, max_rows_to_read, read_offsets);
readData(column_from_part, column, from_mark, continue_reading, max_rows_to_read, cache);
/// For elements of Nested, column_size_before_reading may be greater than column size
/// if offsets are not empty and were already read, but elements are empty.
@ -130,8 +107,6 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
if (column->empty())
res_columns[pos] = nullptr;
else
res_columns[pos] = std::move(column);
}
/// NOTE: positions for all streams must be kept in sync.
@ -159,12 +134,12 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
return read_rows;
}
void MergeTreeReaderWide::addStreams(const String & name, const IDataType & type,
void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type,
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type)
{
IDataType::StreamCallback callback = [&] (const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path);
if (streams.count(stream_name))
return;
@ -186,24 +161,24 @@ void MergeTreeReaderWide::addStreams(const String & name, const IDataType & type
};
IDataType::SubstreamPath substream_path;
type.enumerateStreams(callback, substream_path);
name_and_type.type->enumerateStreams(callback, substream_path);
}
void MergeTreeReaderWide::readData(
const String & name, const IDataType & type, IColumn & column,
const NameAndTypePair & name_and_type, ColumnPtr & column,
size_t from_mark, bool continue_reading, size_t max_rows_to_read,
bool with_offsets)
IDataType::SubstreamsCache & cache)
{
auto get_stream_getter = [&](bool stream_for_prefix) -> IDataType::InputStreamGetter
{
return [&, stream_for_prefix](const IDataType::SubstreamPath & substream_path) -> ReadBuffer *
{
/// If offsets for arrays have already been read.
if (!with_offsets && substream_path.size() == 1 && substream_path[0].type == IDataType::Substream::ArraySizes)
/// If the substream has already been read.
if (cache.count(IDataType::getSubcolumnNameForStream(substream_path)))
return nullptr;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
String stream_name = IDataType::getFileNameForStream(name_and_type, substream_path);
auto it = streams.find(stream_name);
if (it == streams.end())
@ -223,21 +198,21 @@ void MergeTreeReaderWide::readData(
};
};
double & avg_value_size_hint = avg_value_size_hints[name];
double & avg_value_size_hint = avg_value_size_hints[name_and_type.name];
IDataType::DeserializeBinaryBulkSettings deserialize_settings;
deserialize_settings.avg_value_size_hint = avg_value_size_hint;
if (deserialize_binary_bulk_state_map.count(name) == 0)
if (deserialize_binary_bulk_state_map.count(name_and_type.name) == 0)
{
deserialize_settings.getter = get_stream_getter(true);
type.deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]);
name_and_type.type->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name_and_type.name]);
}
deserialize_settings.getter = get_stream_getter(false);
deserialize_settings.continuous_reading = continue_reading;
auto & deserialize_state = deserialize_binary_bulk_state_map[name];
type.deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, deserialize_settings, deserialize_state);
IDataType::updateAvgValueSizeHint(column, avg_value_size_hint);
auto & deserialize_state = deserialize_binary_bulk_state_map[name_and_type.name];
name_and_type.type->deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, deserialize_settings, deserialize_state, &cache);
IDataType::updateAvgValueSizeHint(*column, avg_value_size_hint);
}
}
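
Closing aside (not part of the diff): MergeTreeReaderWide drops the ad-hoc "offsets already read" flag in favour of a per-storage-column IDataType::SubstreamsCache, so substreams shared between subcolumns (such as Nested array offsets) are deserialized once and reused. A toy sketch of such a cache; the types and the pretend disk read are stand-ins for illustration only.

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified stand-ins: a substream is just a vector of offsets and the
    // cache maps a substream name to already-deserialized data.
    using Column = std::vector<size_t>;
    using SubstreamsCache = std::map<std::string, Column>;

    Column readSubstream(const std::string & name, SubstreamsCache & cache)
    {
        if (auto it = cache.find(name); it != cache.end())
        {
            std::cout << "reusing cached substream " << name << '\n';
            return it->second;
        }

        std::cout << "reading substream " << name << " from disk\n";
        Column data = {3, 6, 9};  // pretend array offsets read from a .bin file
        cache.emplace(name, data);
        return data;
    }

    int main()
    {
        SubstreamsCache cache;            // one cache per storage column, e.g. "n"
        readSubstream("n.size0", cache);  // first Nested element reads the shared offsets
        readSubstream("n.size0", cache);  // second element reuses the cached offsets
    }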

Some files were not shown because too many files have changed in this diff.