Merge branch 'master' into fix-annoy-index-update

This commit is contained in:
Tian Xinhui 2023-08-22 16:58:29 +08:00 committed by GitHub
commit 76016d9593
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
201 changed files with 5281 additions and 805 deletions

12
.gitmodules vendored
View File

@ -347,3 +347,15 @@
[submodule "contrib/incbin"]
path = contrib/incbin
url = https://github.com/graphitemaster/incbin.git
[submodule "contrib/usearch"]
path = contrib/usearch
url = https://github.com/unum-cloud/usearch.git
[submodule "contrib/SimSIMD"]
path = contrib/SimSIMD
url = https://github.com/ashvardanian/SimSIMD.git
[submodule "contrib/FP16"]
path = contrib/FP16
url = https://github.com/Maratyszcza/FP16.git
[submodule "contrib/robin-map"]
path = contrib/robin-map
url = https://github.com/Tessil/robin-map.git

View File

@ -11,6 +11,7 @@
#include <base/defines.h>
#include <base/types.h>
#include <base/unaligned.h>
#include <base/simd.h>
#include <city.h>
@ -29,6 +30,11 @@
#define CRC_INT __crc32cd
#endif
#if defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>
#pragma clang diagnostic ignored "-Wreserved-identifier"
#endif
/**
* The std::string_view-like container to avoid creating strings to find substrings in the hash table.
@ -74,14 +80,14 @@ using StringRefs = std::vector<StringRef>;
* For more information, see hash_map_string_2.cpp
*/
inline bool compareSSE2(const char * p1, const char * p2)
inline bool compare8(const char * p1, const char * p2)
{
return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(p1)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(p2))));
}
inline bool compareSSE2x4(const char * p1, const char * p2)
inline bool compare64(const char * p1, const char * p2)
{
return 0xFFFF == _mm_movemask_epi8(
_mm_and_si128(
@ -101,7 +107,30 @@ inline bool compareSSE2x4(const char * p1, const char * p2)
_mm_loadu_si128(reinterpret_cast<const __m128i *>(p2) + 3)))));
}
inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)
#elif defined(__aarch64__) && defined(__ARM_NEON)
/// NEON variant: checks whether the 16 bytes starting at p1 equal the 16 bytes starting at p2.
/// Both pointers may be unaligned; 16 bytes are read from each of them.
inline bool compare8(const char * p1, const char * p2)
{
    const auto lhs = vld1q_u8(reinterpret_cast<const unsigned char *>(p1));
    const auto rhs = vld1q_u8(reinterpret_cast<const unsigned char *>(p2));

    /// Every matching byte contributes an all-ones nibble, so the blocks are equal
    /// only if all 64 bits of the nibble mask are set.
    const uint64_t equal_nibbles = getNibbleMask(vceqq_u8(lhs, rhs));
    return equal_nibbles == 0xFFFFFFFFFFFFFFFF;
}
/// NEON variant: checks whether the 64 bytes starting at p1 equal the 64 bytes starting at p2.
/// The data is processed as four 16-byte lanes whose per-byte equality results are AND-ed together.
inline bool compare64(const char * p1, const char * p2)
{
    const auto * lhs = reinterpret_cast<const unsigned char *>(p1);
    const auto * rhs = reinterpret_cast<const unsigned char *>(p2);

    const auto eq_00 = vceqq_u8(vld1q_u8(lhs), vld1q_u8(rhs));
    const auto eq_16 = vceqq_u8(vld1q_u8(lhs + 16), vld1q_u8(rhs + 16));
    const auto eq_32 = vceqq_u8(vld1q_u8(lhs + 32), vld1q_u8(rhs + 32));
    const auto eq_48 = vceqq_u8(vld1q_u8(lhs + 48), vld1q_u8(rhs + 48));

    /// A byte of the combined vector stays 0xFF only if that position matched in all four lanes.
    const auto eq_all = vandq_u8(vandq_u8(eq_00, eq_16), vandq_u8(eq_32, eq_48));

    const uint64_t equal_nibbles = getNibbleMask(eq_all);
    return equal_nibbles == 0xFFFFFFFFFFFFFFFF;
}
#endif
#if defined(__SSE2__) || (defined(__aarch64__) && defined(__ARM_NEON))
inline bool memequalWide(const char * p1, const char * p2, size_t size)
{
/** The order of branches and the trick with overlapping comparisons
* are the same as in memcpy implementation.
@ -138,7 +167,7 @@ inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)
while (size >= 64)
{
if (compareSSE2x4(p1, p2))
if (compare64(p1, p2))
{
p1 += 64;
p2 += 64;
@ -150,17 +179,16 @@ inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)
switch (size / 16)
{
case 3: if (!compareSSE2(p1 + 32, p2 + 32)) return false; [[fallthrough]];
case 2: if (!compareSSE2(p1 + 16, p2 + 16)) return false; [[fallthrough]];
case 1: if (!compareSSE2(p1, p2)) return false;
case 3: if (!compare8(p1 + 32, p2 + 32)) return false; [[fallthrough]];
case 2: if (!compare8(p1 + 16, p2 + 16)) return false; [[fallthrough]];
case 1: if (!compare8(p1, p2)) return false;
}
return compareSSE2(p1 + size - 16, p2 + size - 16);
return compare8(p1 + size - 16, p2 + size - 16);
}
#endif
inline bool operator== (StringRef lhs, StringRef rhs)
{
if (lhs.size != rhs.size)
@ -169,8 +197,8 @@ inline bool operator== (StringRef lhs, StringRef rhs)
if (lhs.size == 0)
return true;
#if defined(__SSE2__)
return memequalSSE2Wide(lhs.data, rhs.data, lhs.size);
#if defined(__SSE2__) || (defined(__aarch64__) && defined(__ARM_NEON))
return memequalWide(lhs.data, rhs.data, lhs.size);
#else
return 0 == memcmp(lhs.data, rhs.data, lhs.size);
#endif

14
base/base/simd.h Normal file
View File

@ -0,0 +1,14 @@
#pragma once
#if defined(__aarch64__) && defined(__ARM_NEON)
# include <arm_neon.h>
# pragma clang diagnostic ignored "-Wreserved-identifier"
/// Packs a 16-byte vector into a 64-bit mask that holds one nibble (4 bits) per input byte.
/// Typically fed with the result of a byte-wise comparison, where each byte is 0x00 or 0xFF,
/// so each nibble of the result is 0x0 or 0xF.
inline uint64_t getNibbleMask(uint8x16_t res)
{
    /// View the bytes as eight 16-bit lanes, shift each lane right by 4 and narrow it to 8 bits:
    /// every resulting byte combines one nibble from each byte of the source pair.
    const auto pairs = vreinterpretq_u16_u8(res);
    const auto packed = vshrn_n_u16(pairs, 4);
    return vget_lane_u64(vreinterpret_u64_u8(packed), 0);
}
#endif

View File

@ -196,6 +196,17 @@ if (ARCH_S390X)
add_contrib(crc32-s390x-cmake crc32-s390x)
endif()
add_contrib (annoy-cmake annoy)
option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES})
if (ENABLE_USEARCH)
add_contrib (FP16-cmake FP16)
add_contrib (robin-map-cmake robin-map)
add_contrib (SimSIMD-cmake SimSIMD)
add_contrib (usearch-cmake usearch) # requires: FP16, robin-map, SimSIMD
else ()
message(STATUS "Not using USearch")
endif ()
add_contrib (xxHash-cmake xxHash)
add_contrib (libbcrypt-cmake libbcrypt)

1
contrib/FP16 vendored Submodule

@ -0,0 +1 @@
Subproject commit 0a92994d729ff76a58f692d3028ca1b64b145d91

View File

@ -0,0 +1 @@
# See contrib/usearch-cmake/CMakeLists.txt

1
contrib/SimSIMD vendored Submodule

@ -0,0 +1 @@
Subproject commit de2cb75b9e9e3389d5e1e51fd9f8ed151f3c17cf

View File

@ -0,0 +1 @@
# See contrib/usearch-cmake/CMakeLists.txt

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit bb179652862b528d94a9032a784796c4db846c3f
Subproject commit 063a9372b4ae304e869a5c5724971d0501552731

View File

@ -19,6 +19,12 @@ add_library (_boost_filesystem ${SRCS_FILESYSTEM})
add_library (boost::filesystem ALIAS _boost_filesystem)
target_include_directories (_boost_filesystem SYSTEM BEFORE PUBLIC ${LIBRARY_DIR})
if (OS_LINUX)
target_compile_definitions (_boost_filesystem PRIVATE
BOOST_FILESYSTEM_HAS_POSIX_AT_APIS=1
)
endif ()
# headers-only
add_library (_boost_headers_only INTERFACE)

View File

@ -1,6 +1,7 @@
option(ENABLE_ISAL_LIBRARY "Enable ISA-L library" ${ENABLE_LIBRARIES})
if (ARCH_AARCH64)
# Disable ISA-L libray on aarch64.
# ISA-L is only available for x86-64, so it shall be disabled for other platforms
if (NOT ARCH_AMD64)
set (ENABLE_ISAL_LIBRARY OFF)
endif ()

View File

@ -147,7 +147,7 @@ target_compile_definitions(_libarchive PUBLIC
target_compile_options(_libarchive PRIVATE "-Wno-reserved-macro-identifier")
if (TARGET ch_contrib::xz)
target_compile_definitions(_libarchive PUBLIC HAVE_LZMA_H=1)
target_compile_definitions(_libarchive PUBLIC HAVE_LZMA_H=1 HAVE_LIBLZMA=1)
target_link_libraries(_libarchive PRIVATE ch_contrib::xz)
endif()
@ -156,6 +156,16 @@ if (TARGET ch_contrib::zlib)
target_link_libraries(_libarchive PRIVATE ch_contrib::zlib)
endif()
if (TARGET ch_contrib::zstd)
target_compile_definitions(_libarchive PUBLIC HAVE_ZSTD_H=1 HAVE_LIBZSTD=1)
target_link_libraries(_libarchive PRIVATE ch_contrib::zstd)
endif()
if (TARGET ch_contrib::bzip2)
target_compile_definitions(_libarchive PUBLIC HAVE_BZLIB_H=1)
target_link_libraries(_libarchive PRIVATE ch_contrib::bzip2)
endif()
if (OS_LINUX)
target_compile_definitions(
_libarchive PUBLIC

@ -1 +1 @@
Subproject commit 4ef26de16c229429141e424375142c9b03234b66
Subproject commit e7b8befca85c8b847614432dba250c22d35fbae0

2
contrib/orc vendored

@ -1 +1 @@
Subproject commit 568d1d60c250af1890f226c182bc15bd8cc94cf1
Subproject commit a20d1d9d7ad4a4be7b7ba97588e16ca8b9abb2b6

1
contrib/robin-map vendored Submodule

@ -0,0 +1 @@
Subproject commit 851a59e0e3063ee0e23089062090a73fd3de482d

View File

@ -0,0 +1 @@
# See contrib/usearch-cmake/CMakeLists.txt

2
contrib/snappy vendored

@ -1 +1 @@
Subproject commit fb057edfed820212076239fd32cb2ff23e9016bf
Subproject commit 6ebb5b1ab8801ea3fde103c5c29f5ab86df5fe7a

1
contrib/usearch vendored Submodule

@ -0,0 +1 @@
Subproject commit 387b78b28b17b8954024ffc81e97cbcfa10d1f30

View File

@ -0,0 +1,17 @@
# USearch is consumed as an INTERFACE target: nothing is compiled here, the target only
# propagates include directories and the ENABLE_USEARCH definition to its dependents.
set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include")

# Libraries whose headers are exposed together with USearch.
set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
# The submodule lives in contrib/SimSIMD (see .gitmodules); "contrib/SimSIMD-map" does not exist.
set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")

add_library(_usearch INTERFACE)

# SYSTEM marks the third-party include directories as system headers for dependents.
target_include_directories(_usearch SYSTEM INTERFACE
    ${FP16_PROJECT_DIR}/include
    ${ROBIN_MAP_PROJECT_DIR}/include
    ${SIMSIMD_PROJECT_DIR}/include
    ${USEARCH_SOURCE_DIR})

add_library(ch_contrib::usearch ALIAS _usearch)
target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH)

View File

@ -19,9 +19,9 @@
<max_threads>12</max_threads>
<!-- disable JIT for perf tests -->
<compile_expressions>1</compile_expressions>
<compile_aggregate_expressions>1</compile_aggregate_expressions>
<compile_sort_description>1</compile_sort_description>
<compile_expressions>0</compile_expressions>
<compile_aggregate_expressions>0</compile_aggregate_expressions>
<compile_sort_description>0</compile_sort_description>
<!-- Don't fail some prewarm queries too early -->
<timeout_before_checking_execution_speed>60</timeout_before_checking_execution_speed>

View File

@ -63,6 +63,7 @@ configure
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
rm /etc/clickhouse-server/config.d/filesystem_caches_path.xml
rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
start
@ -93,6 +94,7 @@ sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_defau
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
rm /etc/clickhouse-server/config.d/filesystem_caches_path.xml
rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
start

View File

@ -0,0 +1,45 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v23.3.9.55-lts (b9c5c8622d3) FIXME as compared to v23.3.8.21-lts (1675f2264f3)
#### Performance Improvement
* Backported in [#52213](https://github.com/ClickHouse/ClickHouse/issues/52213): Do not store blocks in `ANY` hash join if nothing is inserted. [#48633](https://github.com/ClickHouse/ClickHouse/pull/48633) ([vdimir](https://github.com/vdimir)).
* Backported in [#52826](https://github.com/ClickHouse/ClickHouse/issues/52826): Fix incorrect projection analysis which invalidates primary keys. This issue only exists when `query_plan_optimize_primary_key = 1, query_plan_optimize_projection = 1` . This fixes [#48823](https://github.com/ClickHouse/ClickHouse/issues/48823) . This fixes [#51173](https://github.com/ClickHouse/ClickHouse/issues/51173) . [#52308](https://github.com/ClickHouse/ClickHouse/pull/52308) ([Amos Bird](https://github.com/amosbird)).
#### Build/Testing/Packaging Improvement
* Backported in [#53019](https://github.com/ClickHouse/ClickHouse/issues/53019): Packing inline cache into docker images sometimes causes strange special effects. Since we don't use it at all, it's good to go. [#53008](https://github.com/ClickHouse/ClickHouse/pull/53008) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#53288](https://github.com/ClickHouse/ClickHouse/issues/53288): The compiler's profile data (`-ftime-trace`) is uploaded to ClickHouse Cloud, the second attempt after [#53100](https://github.com/ClickHouse/ClickHouse/issues/53100). [#53213](https://github.com/ClickHouse/ClickHouse/pull/53213) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Backported in [#53461](https://github.com/ClickHouse/ClickHouse/issues/53461): Preserve environment parameters in `clickhouse start` command. Fixes [#51962](https://github.com/ClickHouse/ClickHouse/issues/51962). [#53418](https://github.com/ClickHouse/ClickHouse/pull/53418) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix optimization to move functions before sorting. [#51481](https://github.com/ClickHouse/ClickHouse/pull/51481) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix Block structure mismatch in Pipe::unitePipes for FINAL [#51492](https://github.com/ClickHouse/ClickHouse/pull/51492) ([Nikita Taranov](https://github.com/nickitat)).
* Fix binary arithmetic for Nullable(IPv4) [#51642](https://github.com/ClickHouse/ClickHouse/pull/51642) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Support IPv4 and IPv6 as dictionary attributes [#51756](https://github.com/ClickHouse/ClickHouse/pull/51756) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix ORDER BY tuple of WINDOW functions [#52145](https://github.com/ClickHouse/ClickHouse/pull/52145) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Disable expression templates for time intervals [#52335](https://github.com/ClickHouse/ClickHouse/pull/52335) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix `countSubstrings()` hang with empty needle and a column haystack [#52409](https://github.com/ClickHouse/ClickHouse/pull/52409) ([Sergei Trifonov](https://github.com/serxa)).
* Fixed inserting into Buffer engine [#52440](https://github.com/ClickHouse/ClickHouse/pull/52440) ([Vasily Nemkov](https://github.com/Enmk)).
* The implementation of AnyHash was non-conformant. [#52448](https://github.com/ClickHouse/ClickHouse/pull/52448) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* init and destroy ares channel on demand. [#52634](https://github.com/ClickHouse/ClickHouse/pull/52634) ([Arthur Passos](https://github.com/arthurpassos)).
* Fix crash in function `tuple` with one sparse column argument [#52659](https://github.com/ClickHouse/ClickHouse/pull/52659) ([Anton Popov](https://github.com/CurtizJ)).
* clickhouse-keeper: fix implementation of server with poll() [#52833](https://github.com/ClickHouse/ClickHouse/pull/52833) ([Andy Fiddaman](https://github.com/citrus-it)).
* Fix password leak in show create mysql table [#52962](https://github.com/ClickHouse/ClickHouse/pull/52962) ([Duc Canh Le](https://github.com/canhld94)).
* Fix incorrect normal projection AST format [#53347](https://github.com/ClickHouse/ClickHouse/pull/53347) ([Amos Bird](https://github.com/amosbird)).
* Fix loading lazy database during system.table select query [#53372](https://github.com/ClickHouse/ClickHouse/pull/53372) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix wrong columns order for queries with parallel FINAL. [#53489](https://github.com/ClickHouse/ClickHouse/pull/53489) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix: interpolate expression takes source column instead of same name aliased from select expression. [#53572](https://github.com/ClickHouse/ClickHouse/pull/53572) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Fix crash in comparison functions due to incorrect query analysis [#52172](https://github.com/ClickHouse/ClickHouse/pull/52172) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix deadlocks in StorageTableFunctionProxy [#52626](https://github.com/ClickHouse/ClickHouse/pull/52626) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Disable test_reverse_dns_query/test.py [#53195](https://github.com/ClickHouse/ClickHouse/pull/53195) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Disable test_host_regexp_multiple_ptr_records/test.py [#53211](https://github.com/ClickHouse/ClickHouse/pull/53211) ([Alexander Tokmakov](https://github.com/tavplubix)).

View File

@ -13,7 +13,7 @@ If more than one table is required, it is highly recommended to use the [Materia
``` sql
CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64)
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password')
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_table', 'postgres_user', 'postgres_password')
PRIMARY KEY key;
```

View File

@ -142,13 +142,15 @@ was specified for ANN indexes, the default value is 100 million.
- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)
- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch)
## Annoy {#annoy}
Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
disabled on ARM due to memory safety problems with the algorithm.
This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which is based on a recursive division of the
space in random linear surfaces (lines in 2D, planes in 3D etc.).
This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random
linear surfaces (lines in 2D, planes in 3D etc.).
<div class='vimeo-container'>
<iframe src="//www.youtube.com/embed/QkCCyLW0ehU"
@ -221,3 +223,59 @@ SETTINGS annoy_index_search_k_nodes=100;
The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
:::
## USearch {#usearch}
This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent
similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the
overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors,
that are expensive to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further
distance computations on modern Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient
navigation around immutable persistent files, without loading them into RAM.
<div class='vimeo-container'>
<iframe src="//www.youtube.com/embed/UMrhB3icP9w"
width="640"
height="360"
frameborder="0"
allow="autoplay;
fullscreen;
picture-in-picture"
allowfullscreen>
</iframe>
</div>
Syntax to create a USearch index over an [Array](../../../sql-reference/data-types/array.md) column:
```sql
CREATE TABLE table_with_usearch_index
(
id Int64,
vectors Array(Float32),
INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column:
```sql
CREATE TABLE table_with_usearch_index
(
id Int64,
vectors Tuple(Float32[, Float32[, ...]]),
INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
USearch currently supports two distance functions:
- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
distance function was specified during index creation, `L2Distance` is used as default.

View File

@ -196,6 +196,7 @@ SELECT * FROM nestedt FORMAT TSV
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`.
- [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`.
- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
- [input_format_tsv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_allow_variable_number_of_columns) - allow variable number of columns in TSV format, ignore extra columns and use default values on missing columns. Default value - `false`.
## TabSeparatedRaw {#tabseparatedraw}
@ -473,7 +474,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values. Default value - `false`.
- [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`.
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`.
## CSVWithNames {#csvwithnames}
@ -502,9 +503,10 @@ the types from input data will be compared with the types of the corresponding c
Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings.
If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.
If setting [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, trailing empty lines at the end of file will be skipped.
Additional settings:
- [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) - enables automatic detection of header with names and types if any. Default value - `true`.
- [input_format_custom_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_skip_trailing_empty_lines) - skip trailing empty lines at the end of file. Default value - `false`.
- [input_format_custom_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_custom_allow_variable_number_of_columns) - allow variable number of columns in CustomSeparated format, ignore extra columns and use default values on missing columns. Default value - `false`.
There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces).
@ -1262,6 +1264,7 @@ SELECT * FROM json_each_row_nested
- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
- [input_format_json_ignore_unknown_keys_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_ignore_unknown_keys_in_named_tuple) - Ignore unknown keys in json object for named tuples. Default value - `false`.
- [input_format_json_compact_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_json_compact_allow_variable_number_of_columns) - allow variable number of columns in JSONCompact/JSONCompactEachRow format, ignore extra columns and use default values on missing columns. Default value - `false`.
- [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
- [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
- [output_format_json_quote_denormals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.

Binary file not shown.

After

Width:  |  Height:  |  Size: 232 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 246 KiB

View File

@ -6,7 +6,34 @@ sidebar_label: MySQL Interface
# MySQL Interface
ClickHouse supports MySQL wire protocol. To enable the MySQL wire protocol, add the [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d` folder:
ClickHouse supports the MySQL wire protocol. This allows tools that are MySQL-compatible to interact with ClickHouse seamlessly (e.g. [Looker Studio](../integrations/data-visualization/looker-studio-and-clickhouse.md)).
## Enabling the MySQL Interface On ClickHouse Cloud
1. After creating your ClickHouse Cloud Service, on the credentials screen, select the MySQL tab
![Credentials screen - Prompt](./images/mysql1.png)
2. Toggle the switch to enable the MySQL interface for this specific service. This will expose port `3306` for this service and prompt you with your MySQL connection screen that includes your unique MySQL username. The password will be the same as the service's default user password.
![Credentials screen - Enabled MySQL](./images/mysql2.png)
Alternatively, in order to enable the MySQL interface for an existing service:
1. Ensure your service is in `Running` state then click on the "View connection string" button for the service you want to enable the MySQL interface for
![Connection screen - Prompt MySQL](./images/mysql3.png)
2. Toggle the switch to enable the MySQL interface for this specific service. This will prompt you to enter the default password.
![Connection screen - Prompt MySQL](./images/mysql4.png)
3. After entering the password, you will be shown the MySQL connection string for this service.
![Connection screen - MySQL Enabled](./images/mysql5.png)
## Enabling the MySQL Interface On Self-managed ClickHouse
Add the [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d/` [folder](../operations/configuration-files):
``` xml
<clickhouse>
@ -20,7 +47,7 @@ Startup your ClickHouse server and look for a log message similar to the followi
{} <Information> Application: Listening for MySQL compatibility protocol: 127.0.0.1:9004
```
## Connect mysql to ClickHouse
## Connect MySQL to ClickHouse
The following command demonstrates how to connect the MySQL client `mysql` to ClickHouse:

View File

@ -221,6 +221,10 @@ Default: 1024
Size of cache for index marks. Zero means disabled.
:::note
This setting can be modified at runtime and will take effect immediately.
:::
Type: UInt64
Default: 0
@ -230,6 +234,10 @@ Default: 0
Size of cache for uncompressed blocks of MergeTree indices. Zero means disabled.
:::note
This setting can be modified at runtime and will take effect immediately.
:::
Type: UInt64
Default: 0
@ -255,6 +263,10 @@ Default: SLRU
Size of cache for marks (index of MergeTree family of tables).
:::note
This setting can be modified at runtime and will take effect immediately.
:::
Type: UInt64
Default: 5368709120
@ -288,7 +300,7 @@ Default: 1000
Limit on total number of concurrently executed queries. Zero means unlimited. Note that limits on insert and select queries, and on the maximum number of queries for users must also be considered. See also max_concurrent_insert_queries, max_concurrent_select_queries, max_concurrent_queries_for_all_users.
:::note
These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
:::
Type: UInt64
@ -300,7 +312,7 @@ Default: 0
Limit on total number of concurrent insert queries. Zero means Unlimited.
:::note
These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
:::
Type: UInt64
@ -312,7 +324,7 @@ Default: 0
Limit on total number of concurrent select queries. Zero means Unlimited.
:::note
These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged.
:::
Type: UInt64
@ -456,6 +468,10 @@ Sets the cache size (in bytes) for mapped files. This setting allows avoiding fr
Note that the amount of data in mapped files does not consume memory directly and is not accounted for in query or server memory usage — because this memory can be discarded similar to the OS page cache. The cache is dropped (the files are closed) automatically on the removal of old parts in tables of the MergeTree family, also it can be dropped manually by the `SYSTEM DROP MMAP CACHE` query.
:::note
This setting can be modified at runtime and will take effect immediately.
:::
Type: UInt64
Default: 1000
@ -605,6 +621,10 @@ There is one shared cache for the server. Memory is allocated on demand. The cac
The uncompressed cache is advantageous for very short queries in individual cases.
:::note
This setting can be modified at runtime and will take effect immediately.
:::
Type: UInt64
Default: 0

View File

@ -627,6 +627,13 @@ Column type should be String. If value is empty, default names `row_{i}`will be
Default value: ''.
### input_format_json_compact_allow_variable_number_of_columns {#input_format_json_compact_allow_variable_number_of_columns}
Allow variable number of columns in rows in JSONCompact/JSONCompactEachRow input formats.
Ignore extra columns in rows with more columns than expected and treat missing columns as default values.
Disabled by default.
## TSV format settings {#tsv-format-settings}
### input_format_tsv_empty_as_default {#input_format_tsv_empty_as_default}
@ -764,6 +771,13 @@ When enabled, trailing empty lines at the end of TSV file will be skipped.
Disabled by default.
### input_format_tsv_allow_variable_number_of_columns {#input_format_tsv_allow_variable_number_of_columns}
Allow variable number of columns in rows in TSV input format.
Ignore extra columns in rows with more columns than expected and treat missing columns as default values.
Disabled by default.
## CSV format settings {#csv-format-settings}
### format_csv_delimiter {#format_csv_delimiter}
@ -955,9 +969,11 @@ Result
```text
" string "
```
### input_format_csv_allow_variable_number_of_columns {#input_format_csv_allow_variable_number_of_columns}
ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values.
Allow variable number of columns in rows in CSV input format.
Ignore extra columns in rows with more columns than expected and treat missing columns as default values.
Disabled by default.
@ -1571,6 +1587,13 @@ When enabled, trailing empty lines at the end of file in CustomSeparated format
Disabled by default.
### input_format_custom_allow_variable_number_of_columns {#input_format_custom_allow_variable_number_of_columns}
Allow variable number of columns in rows in CustomSeparated input format.
Ignore extra columns in rows with more columns than expected and treat missing columns as default values.
Disabled by default.
## Regexp format settings {#regexp-format-settings}
### format_regexp_escaping_rule {#format_regexp_escaping_rule}

View File

@ -1819,6 +1819,72 @@ Result:
└────────────────────────────────────┘
```
## toUTCTimestamp
Converts a DateTime/DateTime64 value from another time zone to a timestamp in the UTC time zone.
**Syntax**
``` sql
toUTCTimestamp(time_val, time_zone)
```
**Arguments**
- `time_val` — A DateTime/DateTime64 type const value or an expression. [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md)
- `time_zone` — A String type const value or an expression representing the time zone. [String types](../../sql-reference/data-types/string.md)
**Returned value**
- DateTime/DateTime64 in text form
**Example**
``` sql
SELECT toUTCTimestamp(toDateTime('2023-03-16'), 'Asia/Shanghai');
```
Result:
``` text
┌─toUTCTimestamp(toDateTime('2023-03-16'),'Asia/Shanghai')┐
│ 2023-03-15 16:00:00 │
└─────────────────────────────────────────────────────────┘
```
## fromUTCTimestamp
Converts a DateTime/DateTime64 value from the UTC time zone to a timestamp in another time zone.
**Syntax**
``` sql
fromUTCTimestamp(time_val, time_zone)
```
**Arguments**
- `time_val` — A DateTime/DateTime64 type const value or an expression. [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md)
- `time_zone` — A String type const value or an expression representing the time zone. [String types](../../sql-reference/data-types/string.md)
**Returned value**
- DateTime/DateTime64 in text form
**Example**
``` sql
SELECT fromUTCTimestamp(toDateTime64('2023-03-16 10:00:00', 3), 'Asia/Shanghai');
```
Result:
``` text
┌─fromUTCTimestamp(toDateTime64('2023-03-16 10:00:00',3),'Asia/Shanghai')─┐
│ 2023-03-16 18:00:00.000 │
└─────────────────────────────────────────────────────────────────────────┘
```
## Related content
- Blog: [Working with time series data in ClickHouse](https://clickhouse.com/blog/working-with-time-series-data-and-functions-ClickHouse)

View File

@ -66,13 +66,13 @@ RELOAD FUNCTION [ON CLUSTER cluster_name] function_name
## DROP DNS CACHE
Resets ClickHouses internal DNS cache. Sometimes (for old ClickHouse versions) it is necessary to use this command when changing the infrastructure (changing the IP address of another ClickHouse server or the server used by dictionaries).
Clears ClickHouse's internal DNS cache. Sometimes (for old ClickHouse versions) it is necessary to use this command when changing the infrastructure (changing the IP address of another ClickHouse server or the server used by dictionaries).
For more convenient (automatic) cache management, see disable_internal_dns_cache, dns_cache_update_period parameters.
## DROP MARK CACHE
Resets the mark cache.
Clears the mark cache.
## DROP REPLICA
@ -106,22 +106,18 @@ Similar to `SYSTEM DROP REPLICA`, but removes the `Replicated` database replica
## DROP UNCOMPRESSED CACHE
Reset the uncompressed data cache.
Clears the uncompressed data cache.
The uncompressed data cache is enabled/disabled with the query/user/profile-level setting [use_uncompressed_cache](../../operations/settings/settings.md#setting-use_uncompressed_cache).
Its size can be configured using the server-level setting [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size).
## DROP COMPILED EXPRESSION CACHE
Reset the compiled expression cache.
Clears the compiled expression cache.
The compiled expression cache is enabled/disabled with the query/user/profile-level setting [compile_expressions](../../operations/settings/settings.md#compile-expressions).
## DROP QUERY CACHE
Resets the [query cache](../../operations/query-cache.md).
```sql
SYSTEM DROP QUERY CACHE [ON CLUSTER cluster_name]
```
Clears the [query cache](../../operations/query-cache.md).
## FLUSH LOGS
@ -443,9 +439,9 @@ SYSTEM STOP LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QU
```
- If `CUSTOM 'protocol'` modifier is specified, the custom protocol with the specified name defined in the protocols section of the server configuration will be stopped.
- If `QUERIES ALL` modifier is specified, all protocols are stopped.
- If `QUERIES DEFAULT` modifier is specified, all default protocols are stopped.
- If `QUERIES CUSTOM` modifier is specified, all custom protocols are stopped.
- If `QUERIES ALL [EXCEPT .. [,..]]` modifier is specified, all protocols are stopped, unless specified with `EXCEPT` clause.
- If `QUERIES DEFAULT [EXCEPT .. [,..]]` modifier is specified, all default protocols are stopped, unless specified with `EXCEPT` clause.
- If `QUERIES CUSTOM [EXCEPT .. [,..]]` modifier is specified, all custom protocols are stopped, unless specified with `EXCEPT` clause.
### SYSTEM START LISTEN

View File

@ -668,8 +668,7 @@ void LocalServer::processConfig()
uncompressed_cache_size = max_cache_size;
LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
if (uncompressed_cache_size)
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
String mark_cache_policy = config().getString("mark_cache_policy", DEFAULT_MARK_CACHE_POLICY);
size_t mark_cache_size = config().getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE);
@ -680,8 +679,7 @@ void LocalServer::processConfig()
mark_cache_size = max_cache_size;
LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
}
if (mark_cache_size)
global_context->setMarkCache(mark_cache_policy, mark_cache_size);
global_context->setMarkCache(mark_cache_policy, mark_cache_size);
size_t index_uncompressed_cache_size = config().getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE);
if (index_uncompressed_cache_size > max_cache_size)
@ -689,8 +687,7 @@ void LocalServer::processConfig()
index_uncompressed_cache_size = max_cache_size;
/// Log the lowered size of the index uncompressed cache itself (not of the uncompressed cache).
LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_uncompressed_cache_size));
}
if (index_uncompressed_cache_size)
global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
size_t index_mark_cache_size = config().getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE);
if (index_mark_cache_size > max_cache_size)
@ -698,8 +695,7 @@ void LocalServer::processConfig()
index_mark_cache_size = max_cache_size;
/// Log the lowered size of the index mark cache itself (not of the uncompressed cache).
LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_mark_cache_size));
}
if (index_mark_cache_size)
global_context->setIndexMarkCache(index_mark_cache_size);
global_context->setIndexMarkCache(index_mark_cache_size);
size_t mmap_cache_size = config().getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE);
if (mmap_cache_size > max_cache_size)
@ -707,11 +703,10 @@ void LocalServer::processConfig()
mmap_cache_size = max_cache_size;
/// Log the lowered size of the mmap file cache itself (not of the uncompressed cache).
LOG_INFO(log, "Lowered mmap file cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mmap_cache_size));
}
if (mmap_cache_size)
global_context->setMMappedFileCache(mmap_cache_size);
global_context->setMMappedFileCache(mmap_cache_size);
/// In Server.cpp (./clickhouse-server), we would initialize the query cache here.
/// Intentionally not doing this in clickhouse-local as it doesn't make sense.
/// Initialize a dummy query cache.
global_context->setQueryCache(0, 0, 0, 0);
#if USE_EMBEDDED_COMPILER
size_t compiled_expression_cache_max_size_in_bytes = config().getUInt64("compiled_expression_cache_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE);

View File

@ -1105,6 +1105,69 @@ try
if (config().has("macros"))
global_context->setMacros(std::make_unique<Macros>(config(), "macros", log));
/// Set up caches.
/// Upper bound applied to every cache size below: physical_server_memory scaled by cache_size_to_ram_max_ratio.
const size_t max_cache_size = static_cast<size_t>(physical_server_memory * server_settings.cache_size_to_ram_max_ratio);
/// Uncompressed cache: the policy is passed through as configured, the size is clamped to max_cache_size.
String uncompressed_cache_policy = server_settings.uncompressed_cache_policy;
size_t uncompressed_cache_size = server_settings.uncompressed_cache_size;
if (uncompressed_cache_size > max_cache_size)
{
uncompressed_cache_size = max_cache_size;
LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
/// The setter is called unconditionally, i.e. also when the resulting size is zero.
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
/// Mark cache: the policy is passed through as configured, the size is clamped to max_cache_size.
String mark_cache_policy = server_settings.mark_cache_policy;
size_t mark_cache_size = server_settings.mark_cache_size;
if (mark_cache_size > max_cache_size)
{
mark_cache_size = max_cache_size;
LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
}
/// The setter is called unconditionally, i.e. also when the resulting size is zero.
global_context->setMarkCache(mark_cache_policy, mark_cache_size);
/// Index uncompressed cache: the configured size is clamped to max_cache_size before the cache is set up.
size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
if (index_uncompressed_cache_size > max_cache_size)
{
    index_uncompressed_cache_size = max_cache_size;
    /// Log the size of this cache; logging uncompressed_cache_size here would report an unrelated value.
    LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_uncompressed_cache_size));
}
global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
/// Index mark cache: the configured size is clamped to max_cache_size before the cache is set up.
size_t index_mark_cache_size = server_settings.index_mark_cache_size;
if (index_mark_cache_size > max_cache_size)
{
    index_mark_cache_size = max_cache_size;
    /// Log the size of this cache; logging uncompressed_cache_size here would report an unrelated value.
    LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_mark_cache_size));
}
global_context->setIndexMarkCache(index_mark_cache_size);
/// Mmap file cache: the configured size is clamped to max_cache_size before the cache is set up.
size_t mmap_cache_size = server_settings.mmap_cache_size;
if (mmap_cache_size > max_cache_size)
{
    mmap_cache_size = max_cache_size;
    /// Log the size of this cache; logging uncompressed_cache_size here would report an unrelated value.
    LOG_INFO(log, "Lowered mmap file cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mmap_cache_size));
}
global_context->setMMappedFileCache(mmap_cache_size);
/// Query cache: limits are read from the `query_cache.*` config keys; only the total size in bytes is clamped to max_cache_size.
size_t query_cache_max_size_in_bytes = config().getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
size_t query_cache_max_entries = config().getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
size_t query_cache_query_cache_max_entry_size_in_bytes = config().getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);
/// NOTE(review): the key "max_entry_rows_in_rows" does not match the variable / default constant naming ("max_entry_size_in_rows").
/// It looks like a typo, but it is kept as is because changing it alters which config key is honoured - confirm against the documentation.
size_t query_cache_max_entry_size_in_rows = config().getUInt64("query_cache.max_entry_rows_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS);
if (query_cache_max_size_in_bytes > max_cache_size)
{
    query_cache_max_size_in_bytes = max_cache_size;
    /// Log the size of this cache; logging uncompressed_cache_size here would report an unrelated value.
    LOG_INFO(log, "Lowered query cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(query_cache_max_size_in_bytes));
}
global_context->setQueryCache(query_cache_max_size_in_bytes, query_cache_max_entries, query_cache_query_cache_max_entry_size_in_bytes, query_cache_max_entry_size_in_rows);
#if USE_EMBEDDED_COMPILER
size_t compiled_expression_cache_max_size_in_bytes = config().getUInt64("compiled_expression_cache_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE);
size_t compiled_expression_cache_max_elements = config().getUInt64("compiled_expression_cache_elements_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_ENTRIES);
CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_max_size_in_bytes, compiled_expression_cache_max_elements);
#endif
/// Initialize main config reloader.
std::string include_from_path = config().getString("include_from", "/etc/metrika.xml");
@ -1324,7 +1387,14 @@ try
global_context->updateStorageConfiguration(*config);
global_context->updateInterserverCredentials(*config);
global_context->updateUncompressedCacheConfiguration(*config);
global_context->updateMarkCacheConfiguration(*config);
global_context->updateIndexUncompressedCacheConfiguration(*config);
global_context->updateIndexMarkCacheConfiguration(*config);
global_context->updateMMappedFileCacheConfiguration(*config);
global_context->updateQueryCacheConfiguration(*config);
CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, "encryption_codecs");
#if USE_SSL
CertificateReloader::instance().tryLoad(*config);
@ -1484,19 +1554,6 @@ try
/// Limit on total number of concurrently executed queries.
global_context->getProcessList().setMaxSize(server_settings.max_concurrent_queries);
/// Set up caches.
const size_t max_cache_size = static_cast<size_t>(physical_server_memory * server_settings.cache_size_to_ram_max_ratio);
String uncompressed_cache_policy = server_settings.uncompressed_cache_policy;
size_t uncompressed_cache_size = server_settings.uncompressed_cache_size;
if (uncompressed_cache_size > max_cache_size)
{
uncompressed_cache_size = max_cache_size;
LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
/// Load global settings from default_profile and system_profile.
global_context->setDefaultProfiles(config());
@ -1512,61 +1569,6 @@ try
server_settings.async_insert_queue_flush_on_shutdown));
}
String mark_cache_policy = server_settings.mark_cache_policy;
size_t mark_cache_size = server_settings.mark_cache_size;
if (!mark_cache_size)
LOG_ERROR(log, "Too low mark cache size will lead to severe performance degradation.");
if (mark_cache_size > max_cache_size)
{
mark_cache_size = max_cache_size;
LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
}
global_context->setMarkCache(mark_cache_policy, mark_cache_size);
size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
if (index_uncompressed_cache_size > max_cache_size)
{
index_uncompressed_cache_size = max_cache_size;
LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
if (index_uncompressed_cache_size)
global_context->setIndexUncompressedCache(server_settings.index_uncompressed_cache_size);
size_t index_mark_cache_size = server_settings.index_mark_cache_size;
if (index_mark_cache_size > max_cache_size)
{
index_mark_cache_size = max_cache_size;
LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
if (index_mark_cache_size)
global_context->setIndexMarkCache(server_settings.index_mark_cache_size);
size_t mmap_cache_size = server_settings.mmap_cache_size;
if (mmap_cache_size > max_cache_size)
{
mmap_cache_size = max_cache_size;
LOG_INFO(log, "Lowered mmap file cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
if (mmap_cache_size)
global_context->setMMappedFileCache(server_settings.mmap_cache_size);
size_t query_cache_max_size_in_bytes = config().getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
size_t query_cache_max_entries = config().getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
size_t query_cache_query_cache_max_entry_size_in_bytes = config().getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);
size_t query_cache_max_entry_size_in_rows = config().getUInt64("query_cache.max_entry_rows_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS);
if (query_cache_max_size_in_bytes > max_cache_size)
{
query_cache_max_size_in_bytes = max_cache_size;
LOG_INFO(log, "Lowered query cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
}
global_context->setQueryCache(query_cache_max_size_in_bytes, query_cache_max_entries, query_cache_query_cache_max_entry_size_in_bytes, query_cache_max_entry_size_in_rows);
#if USE_EMBEDDED_COMPILER
size_t compiled_expression_cache_max_size_in_bytes = config().getUInt64("compiled_expression_cache_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE);
size_t compiled_expression_cache_max_elements = config().getUInt64("compiled_expression_cache_elements_size", DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_ENTRIES);
CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_max_size_in_bytes, compiled_expression_cache_max_elements);
#endif
/// Set path for format schema files
fs::path format_schema_path(config().getString("format_schema_path", path / "format_schemas/"));
global_context->setFormatSchemaPath(format_schema_path);
@ -2072,6 +2074,9 @@ void Server::createServers(
for (const auto & protocol : protocols)
{
if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol))
continue;
std::string prefix = "protocols." + protocol + ".";
std::string port_name = prefix + "port";
std::string description {"<undefined> protocol"};
@ -2081,9 +2086,6 @@ void Server::createServers(
if (!config.has(prefix + "port"))
continue;
if (!server_type.shouldStart(ServerType::Type::CUSTOM, port_name))
continue;
std::vector<std::string> hosts;
if (config.has(prefix + "host"))
hosts.push_back(config.getString(prefix + "host"));

View File

@ -11,6 +11,7 @@
--background: linear-gradient(to bottom, #00CCFF, #00D0D0);
--chart-background: white;
--shadow-color: rgba(0, 0, 0, 0.25);
--moving-shadow-color: rgba(0, 0, 0, 0.5);
--input-shadow-color: rgba(0, 255, 0, 1);
--error-color: red;
--auth-error-color: white;
@ -34,6 +35,7 @@
--background: #151C2C;
--chart-background: #1b2834;
--shadow-color: rgba(0, 0, 0, 0);
--moving-shadow-color: rgba(255, 255, 255, 0.25);
--input-shadow-color: rgba(255, 128, 0, 0.25);
--error-color: #F66;
--legend-background: rgba(255, 255, 255, 0.25);
@ -91,6 +93,21 @@
position: relative;
}
.chart-maximized {
flex: 1 100%;
height: 75vh
}
.chart-moving {
z-index: 11;
box-shadow: 0 0 2rem var(--moving-shadow-color);
}
.chart-displaced {
opacity: 75%;
filter: blur(1px);
}
.chart div { position: absolute; }
.inputs {
@ -303,6 +320,7 @@
}
.chart-buttons a {
margin-right: 0.25rem;
user-select: none;
}
.chart-buttons a:hover {
color: var(--chart-button-hover-color);
@ -454,11 +472,13 @@
let host = 'https://play.clickhouse.com/';
let user = 'explorer';
let password = '';
let add_http_cors_header = true;
/// If it is hosted on server, assume that it is the address of ClickHouse.
if (location.protocol != 'file:') {
host = location.origin;
user = 'default';
add_http_cors_header = false;
}
const errorCodeMessageMap = {
@ -793,6 +813,92 @@ function insertChart(i) {
let edit_buttons = document.createElement('div');
edit_buttons.className = 'chart-buttons';
/// "Move" button: press it and drag to move the chart; dropping it while its center is over
/// another chart moves the corresponding entry of `queries` to that position and redraws everything.
let move = document.createElement('a');
let move_text = document.createTextNode('✥');
move.appendChild(move_text);

let is_dragging = false;
move.addEventListener('mousedown', e => {
    const idx = getCurrentIndex();
    is_dragging = true;
    chart.className = 'chart chart-moving';

    /// Pointer position at the start of the drag; the chart is shifted by the delta from it.
    let offset_x = e.clientX;
    let offset_y = e.clientY;

    /// The chart (and its index) currently under the center of the dragged chart, if any.
    let displace_idx = null;
    let displace_chart = null;

    /// Unregister both handlers of this drag. Doing it as soon as the drag ends (and not only
    /// on the next mousemove) guarantees that a new drag never runs together with stale handlers.
    function stopListening() {
        document.body.removeEventListener('mousemove', mousemove);
        document.body.removeEventListener('mouseup', mouseup);
    }

    function mouseup(e) {
        is_dragging = false;
        stopListening();

        chart.className = 'chart';
        chart.style.left = null;
        chart.style.top = null;

        if (displace_idx !== null) {
            /// Move the dragged query to the position of the displaced one.
            const elem = queries[idx];
            queries.splice(idx, 1);
            queries.splice(displace_idx, 0, elem);

            displace_chart.className = 'chart';
            drawAll();
        }
    }

    function mousemove(e) {
        /// Defensive: the drag has already ended, nothing to track anymore.
        if (!is_dragging) {
            stopListening();
            return;
        }

        let x = e.clientX - offset_x;
        let y = e.clientY - offset_y;

        chart.style.left = `${x}px`;
        chart.style.top = `${y}px`;

        displace_idx = null;
        displace_chart = null;
        let current_idx = -1;
        for (const elem of charts.querySelectorAll('.chart')) {
            ++current_idx;
            if (current_idx == idx) {
                continue;
            }

            const this_rect = chart.getBoundingClientRect();
            const this_center_x = this_rect.left + this_rect.width / 2;
            const this_center_y = this_rect.top + this_rect.height / 2;

            const elem_rect = elem.getBoundingClientRect();

            /// Highlight the chart that contains the center of the dragged chart.
            if (this_center_x >= elem_rect.left && this_center_x <= elem_rect.right
                && this_center_y >= elem_rect.top && this_center_y <= elem_rect.bottom) {

                elem.className = 'chart chart-displaced';
                displace_idx = current_idx;
                displace_chart = elem;
            } else {
                elem.className = 'chart';
            }
        }
    }

    document.body.addEventListener('mouseup', mouseup);
    document.body.addEventListener('mousemove', mousemove);
});
/// "Maximize" button: toggles the chart between the plain 'chart' class and 'chart chart-maximized',
/// then calls resize() so that the layout change is picked up.
let maximize = document.createElement('a');
let maximize_text = document.createTextNode('🗖');
maximize.appendChild(maximize_text);

maximize.addEventListener('click', e => {
    chart.className = (chart.className == 'chart' ? 'chart chart-maximized' : 'chart');
    resize();
});
let edit = document.createElement('a');
let edit_text = document.createTextNode('✎');
edit.appendChild(edit_text);
@ -825,6 +931,8 @@ function insertChart(i) {
saveState();
});
edit_buttons.appendChild(move);
edit_buttons.appendChild(maximize);
edit_buttons.appendChild(edit);
edit_buttons.appendChild(trash);
@ -962,8 +1070,6 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
};
}
let add_http_cors_header = false;
async function draw(idx, chart, url_params, query) {
if (plots[idx]) {
plots[idx].destroy();

View File

@ -46,7 +46,7 @@ void MultipleAccessStorage::setStorages(const std::vector<StoragePtr> & storages
{
std::lock_guard lock{mutex};
nested_storages = std::make_shared<const Storages>(storages);
ids_cache.reset();
ids_cache.clear();
}
void MultipleAccessStorage::addStorage(const StoragePtr & new_storage)
@ -69,7 +69,7 @@ void MultipleAccessStorage::removeStorage(const StoragePtr & storage_to_remove)
auto new_storages = std::make_shared<Storages>(*nested_storages);
new_storages->erase(new_storages->begin() + index);
nested_storages = new_storages;
ids_cache.reset();
ids_cache.clear();
}
std::vector<StoragePtr> MultipleAccessStorage::getStorages()

View File

@ -109,7 +109,7 @@ public:
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinary(this->data(place).numerator, buf);
writeBinaryLittleEndian(this->data(place).numerator, buf);
if constexpr (std::is_unsigned_v<Denominator>)
writeVarUInt(this->data(place).denominator, buf);
@ -119,7 +119,7 @@ public:
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinary(this->data(place).numerator, buf);
readBinaryLittleEndian(this->data(place).numerator, buf);
if constexpr (std::is_unsigned_v<Denominator>)
readVarUInt(this->data(place).denominator, buf);

View File

@ -375,7 +375,7 @@ void BackupImpl::readBackupMetadata()
if (!archive_reader->fileExists(".backup"))
throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Archive {} is not a backup", backup_name_for_logging);
setCompressedSize();
in = archive_reader->readFile(".backup");
in = archive_reader->readFile(".backup", /*throw_on_not_found=*/true);
}
else
{
@ -685,7 +685,7 @@ std::unique_ptr<SeekableReadBuffer> BackupImpl::readFileImpl(const SizeAndChecks
{
/// Make `read_buffer` if there is data for this backup entry in this backup.
if (use_archive)
read_buffer = archive_reader->readFile(info.data_file_name);
read_buffer = archive_reader->readFile(info.data_file_name, /*throw_on_not_found=*/true);
else
read_buffer = reader->readFile(info.data_file_name);
}

View File

@ -599,6 +599,10 @@ if (TARGET ch_contrib::annoy)
dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
endif()
if (TARGET ch_contrib::usearch)
dbms_target_link_libraries(PUBLIC ch_contrib::usearch)
endif()
if (TARGET ch_rust::skim)
dbms_target_include_directories(PRIVATE $<TARGET_PROPERTY:ch_rust::skim,INTERFACE_INCLUDE_DIRECTORIES>)
dbms_target_link_libraries(PUBLIC ch_rust::skim)

View File

@ -151,7 +151,7 @@ public:
std::lock_guard cache_lock(mutex);
/// Insert the new value only if the token is still present in insert_tokens.
/// (The token may be absent because of a concurrent reset() call).
/// (The token may be absent because of a concurrent clear() call).
bool result = false;
auto token_it = insert_tokens.find(key);
if (token_it != insert_tokens.end() && token_it->second.get() == token)
@ -179,13 +179,13 @@ public:
return cache_policy->dump();
}
void reset()
void clear()
{
std::lock_guard lock(mutex);
insert_tokens.clear();
hits = 0;
misses = 0;
cache_policy->reset(lock);
cache_policy->clear(lock);
}
void remove(const Key & key)

View File

@ -270,8 +270,8 @@ std::unordered_set<String> DNSResolver::reverseResolve(const Poco::Net::IPAddres
void DNSResolver::dropCache()
{
impl->cache_host.reset();
impl->cache_address.reset();
impl->cache_host.clear();
impl->cache_address.clear();
std::scoped_lock lock(impl->update_mutex, impl->drop_mutex);

View File

@ -20,7 +20,7 @@ template <typename T>
static inline void writeQuoted(const DecimalField<T> & x, WriteBuffer & buf)
{
writeChar('\'', buf);
writeText(x.getValue(), x.getScale(), buf, {});
writeText(x.getValue(), x.getScale(), buf, /* trailing_zeros */ true);
writeChar('\'', buf);
}

View File

@ -201,11 +201,11 @@ struct HashTableCell
void setMapped(const value_type & /*value*/) {}
/// Serialization, in binary and text form.
void write(DB::WriteBuffer & wb) const { DB::writeBinary(key, wb); }
void write(DB::WriteBuffer & wb) const { DB::writeBinaryLittleEndian(key, wb); }
void writeText(DB::WriteBuffer & wb) const { DB::writeDoubleQuoted(key, wb); }
/// Deserialization, in binary and text form.
void read(DB::ReadBuffer & rb) { DB::readBinary(key, rb); }
void read(DB::ReadBuffer & rb) { DB::readBinaryLittleEndian(key, rb); }
void readText(DB::ReadBuffer & rb) { DB::readDoubleQuoted(key, rb); }
/// When cell pointer is moved during erase, reinsert or resize operations

View File

@ -10,11 +10,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
template <typename T>
struct EqualWeightFunction
{
@ -46,8 +41,8 @@ public:
virtual size_t count(std::lock_guard<std::mutex> & /*cache_lock*/) const = 0;
virtual size_t maxSize(std::lock_guard<std::mutex>& /*cache_lock*/) const = 0;
virtual void setMaxCount(size_t /*max_count*/, std::lock_guard<std::mutex> & /* cache_lock */) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for cache policy"); }
virtual void setMaxSize(size_t /*max_size_in_bytes*/, std::lock_guard<std::mutex> & /* cache_lock */) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for cache policy"); }
virtual void setMaxCount(size_t /*max_count*/, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
virtual void setMaxSize(size_t /*max_size_in_bytes*/, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
virtual void setQuotaForUser(const String & user_name, size_t max_size_in_bytes, size_t max_entries, std::lock_guard<std::mutex> & /*cache_lock*/) { user_quotas->setQuotaForUser(user_name, max_size_in_bytes, max_entries); }
/// HashFunction usually hashes the entire key and the found key will be equal the provided key. In such cases, use get(). It is also
@ -60,7 +55,7 @@ public:
virtual void remove(const Key & key, std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
virtual void reset(std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
virtual void clear(std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
virtual std::vector<KeyMapped> dump() const = 0;
protected:

View File

@ -7,9 +7,8 @@
namespace DB
{
/// Cache policy LRU evicts entries which are not used for a long time.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size)
/// of that value.
/// Cache policy LRU evicts entries which are not used for a long time. Also see cache policy SLRU for reference.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size) of that value.
/// Cache starts to evict entries when their total weight exceeds max_size_in_bytes.
/// Value weight should not change after insertion.
/// To work with the thread-safe implementation of this class use a class "CacheBase" with first parameter "LRU"
@ -24,11 +23,12 @@ public:
using typename Base::OnWeightLossFunction;
/** Initialize LRUCachePolicy with max_size_in_bytes and max_count.
* max_size_in_bytes == 0 means the cache accepts no entries.
* max_count == 0 means no elements size restrictions.
*/
LRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, OnWeightLossFunction on_weight_loss_function_)
: Base(std::make_unique<NoCachePolicyUserQuota>())
, max_size_in_bytes(std::max(1uz, max_size_in_bytes_))
, max_size_in_bytes(max_size_in_bytes_)
, max_count(max_count_)
, on_weight_loss_function(on_weight_loss_function_)
{
@ -49,7 +49,19 @@ public:
return max_size_in_bytes;
}
void reset(std::lock_guard<std::mutex> & /* cache_lock */) override
/// Replaces the limit on the number of entries with max_count_ and then calls removeOverflow()
/// (presumably to evict entries which no longer fit - see removeOverflow() for the exact rules).
/// The lock guard parameter is unnamed and not used in the body.
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_count = max_count_;
removeOverflow();
}
/// Replaces the total weight limit (in bytes) with max_size_in_bytes_ and then calls removeOverflow(),
/// presumably so that entries exceeding the new limit are evicted right away.
/// The lock_guard argument is not used in the body; NOTE(review): it looks like it only
/// documents that the caller must already hold the cache mutex — confirm against CacheBase.
void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_size_in_bytes = max_size_in_bytes_;
removeOverflow();
}
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
{
queue.clear();
cells.clear();
@ -155,8 +167,8 @@ private:
/// Total weight of values.
size_t current_size_in_bytes = 0;
const size_t max_size_in_bytes;
const size_t max_count;
size_t max_size_in_bytes;
size_t max_count;
WeightFunction weight_function;
OnWeightLossFunction on_weight_loss_function;
@ -172,10 +184,7 @@ private:
auto it = cells.find(key);
if (it == cells.end())
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent
const auto & cell = it->second;
@ -190,10 +199,7 @@ private:
on_weight_loss_function(current_weight_lost);
if (current_size_in_bytes > (1ull << 63))
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent
}
};

View File

@ -9,9 +9,8 @@ namespace DB
{
/// Cache policy SLRU evicts entries which were used only once and are not used for a long time,
/// this policy protects entries which were used more than once from a sequential scan.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size)
/// of that value.
/// this policy protects entries which were used more than once from a sequential scan. Also see cache policy LRU for reference.
/// WeightFunction is a functor that takes Mapped as a parameter and returns "weight" (approximate size) of that value.
/// Cache starts to evict entries when their total weight exceeds max_size_in_bytes.
/// Value weight should not change after insertion.
/// To work with the thread-safe implementation of this class use a class "CacheBase" with first parameter "SLRU"
@ -30,8 +29,9 @@ public:
* max_protected_size == 0 means that the default protected size is equal to half of the total max size.
*/
/// TODO: construct from special struct with cache policy parameters (also with max_protected_size).
SLRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, double size_ratio, OnWeightLossFunction on_weight_loss_function_)
SLRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, double size_ratio_, OnWeightLossFunction on_weight_loss_function_)
: Base(std::make_unique<NoCachePolicyUserQuota>())
, size_ratio(size_ratio_)
, max_protected_size(static_cast<size_t>(max_size_in_bytes_ * std::min(1.0, size_ratio)))
, max_size_in_bytes(max_size_in_bytes_)
, max_count(max_count_)
@ -54,7 +54,22 @@ public:
return max_size_in_bytes;
}
void reset(std::lock_guard<std::mutex> & /* cache_lock */) override
/// Replaces the entry-count limit with max_count_ and re-runs overflow removal on both segments.
/// The protected queue is processed first (against max_protected_size), then the probationary
/// queue (against max_size_in_bytes); the size limits themselves are left unchanged here.
/// The lock_guard argument is not used in the body; NOTE(review): it looks like it only
/// documents that the caller must already hold the cache mutex — confirm against CacheBase.
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_count = max_count_;
removeOverflow(protected_queue, max_protected_size, current_protected_size, /*is_protected=*/true);
removeOverflow(probationary_queue, max_size_in_bytes, current_size_in_bytes, /*is_protected=*/false);
}
/// Replaces the total weight limit (in bytes) with max_size_in_bytes_.
/// The protected-segment limit is recomputed from the new total multiplied by size_ratio
/// (the ratio is capped at 1.0), then overflow removal is re-run on the protected queue
/// first and on the probationary queue second.
/// The lock_guard argument is not used in the body; NOTE(review): it looks like it only
/// documents that the caller must already hold the cache mutex — confirm against CacheBase.
void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
{
max_protected_size = static_cast<size_t>(max_size_in_bytes_ * std::min(1.0, size_ratio));
max_size_in_bytes = max_size_in_bytes_;
removeOverflow(protected_queue, max_protected_size, current_protected_size, /*is_protected=*/true);
removeOverflow(probationary_queue, max_size_in_bytes, current_size_in_bytes, /*is_protected=*/false);
}
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
{
cells.clear();
probationary_queue.clear();
@ -68,12 +83,13 @@ public:
auto it = cells.find(key);
if (it == cells.end())
return;
auto & cell = it->second;
current_size_in_bytes -= cell.size;
if (cell.is_protected)
{
current_protected_size -= cell.size;
}
auto & queue = cell.is_protected ? protected_queue : probationary_queue;
queue.erase(cell.queue_iterator);
cells.erase(it);
@ -192,16 +208,17 @@ private:
Cells cells;
const double size_ratio;
size_t current_protected_size = 0;
size_t current_size_in_bytes = 0;
const size_t max_protected_size;
const size_t max_size_in_bytes;
const size_t max_count;
size_t max_protected_size;
size_t max_size_in_bytes;
size_t max_count;
WeightFunction weight_function;
OnWeightLossFunction on_weight_loss_function;
void removeOverflow(SLRUQueue & queue, const size_t max_weight_size, size_t & current_weight_size, bool is_protected)
void removeOverflow(SLRUQueue & queue, size_t max_weight_size, size_t & current_weight_size, bool is_protected)
{
size_t current_weight_lost = 0;
size_t queue_size = queue.size();
@ -223,8 +240,7 @@ private:
{
need_remove = [&]()
{
return ((max_count != 0 && cells.size() > max_count)
|| (current_weight_size > max_weight_size)) && (queue_size > 0);
return ((max_count != 0 && cells.size() > max_count) || (current_weight_size > max_weight_size)) && (queue_size > 0);
};
}
@ -234,10 +250,7 @@ private:
auto it = cells.find(key);
if (it == cells.end())
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent
auto & cell = it->second;
@ -262,10 +275,7 @@ private:
on_weight_loss_function(current_weight_lost);
if (current_size_in_bytes > (1ull << 63))
{
// Queue became inconsistent
abort();
}
std::terminate(); // Queue became inconsistent
}
};

View File

@ -121,7 +121,7 @@ public:
max_size_in_bytes = max_size_in_bytes_;
}
void reset(std::lock_guard<std::mutex> & /* cache_lock */) override
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
{
cache.clear();
}

View File

@ -2,6 +2,7 @@
#include <optional>
#include <base/types.h>
#include <base/simd.h>
#include <Common/BitHelpers.h>
#include <Poco/UTF8Encoding.h>
@ -72,16 +73,13 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)
res += __builtin_popcount(_mm_movemask_epi8(
_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)), threshold)));
#elif defined(__aarch64__) && defined(__ARM_NEON)
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask
= [](uint8x16_t input) -> uint64_t { return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
constexpr auto bytes_sse = 16;
const auto * src_end_sse = data + size / bytes_sse * bytes_sse;
const auto threshold = vdupq_n_s8(0xBF);
for (; data < src_end_sse; data += bytes_sse)
res += std::popcount(get_nibble_mask(vcgtq_s8(vld1q_s8(reinterpret_cast<const int8_t *>(data)), threshold)));
res += std::popcount(getNibbleMask(vcgtq_s8(vld1q_s8(reinterpret_cast<const int8_t *>(data)), threshold)));
res >>= 2;
#endif

View File

@ -4,6 +4,8 @@
#include <bit>
#include <cstdint>
#include <base/simd.h>
#include <Core/Defines.h>
@ -504,11 +506,6 @@ inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
# include <arm_neon.h>
# pragma clang diagnostic ignored "-Wreserved-identifier"
inline uint64_t getNibbleMask(uint8x16_t res)
{
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(res), 4)), 0);
}
template <typename Char>
inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
{

View File

@ -92,7 +92,7 @@ TEST(SLRUCache, removeFromProtected)
ASSERT_TRUE(value == nullptr);
}
TEST(SLRUCache, reset)
TEST(SLRUCache, clear)
{
using SimpleCacheBase = DB::CacheBase<int, int>;
auto slru_cache = SimpleCacheBase("SLRU", /*max_size_in_bytes=*/10, /*max_count=*/0, /*size_ratio*/0.5);
@ -101,7 +101,7 @@ TEST(SLRUCache, reset)
slru_cache.set(2, std::make_shared<int>(4)); /// add to protected_queue
slru_cache.reset();
slru_cache.clear();
auto value = slru_cache.get(1);
ASSERT_TRUE(value == nullptr);

View File

@ -73,8 +73,8 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
const char * const source_end = source + source_size;
while (source < source_end)
{
T curr_src = unalignedLoad<T>(source);
unalignedStore<T>(dest, curr_src - prev_src);
T curr_src = unalignedLoadLittleEndian<T>(source);
unalignedStoreLittleEndian<T>(dest, curr_src - prev_src);
prev_src = curr_src;
source += sizeof(T);
@ -94,10 +94,10 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest,
const char * const source_end = source + source_size;
while (source < source_end)
{
accumulator += unalignedLoad<T>(source);
accumulator += unalignedLoadLittleEndian<T>(source);
if (dest + sizeof(accumulator) > output_end) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data");
unalignedStore<T>(dest, accumulator);
unalignedStoreLittleEndian<T>(dest, accumulator);
source += sizeof(T);
dest += sizeof(T);

View File

@ -86,6 +86,37 @@ struct DataTypeDecimalTrait
}
};
/// Computes result = x * multiplier + delta, checking both steps for overflow.
/// On overflow: throws DECIMAL_OVERFLOW if throw_on_error is set, otherwise returns false.
/// Returns true when neither the multiplication nor the addition overflowed.
template <typename T, bool throw_on_error>
inline bool multiplyAdd(const T & x, const T & multiplier, const T & delta, T & result)
{
    T product = 0;

    /// Short-circuit evaluation: the addition is attempted only if the multiplication did not overflow.
    const bool overflowed = common::mulOverflow(x, multiplier, product) || common::addOverflow(product, delta, result);
    if (!overflowed)
        return true;

    if constexpr (throw_on_error)
        throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
    return false;
}
/// Throwing variant of multiplyAdd: returns x * multiplier + delta by delegating
/// to the checked overload instantiated with throw_on_error = true.
template <typename T>
inline T multiplyAdd(const T & x, const T & multiplier, const T & delta)
{
    T sum;
    /// With throw_on_error = true the call either fills `sum` or throws, so its bool result is not inspected.
    multiplyAdd<T, true>(x, multiplier, delta, sum);
    return sum;
}
/** Make a decimal value from whole and fractional components with given scale multiplier.
* where scale_multiplier = scaleMultiplier<T>(scale)
* this is to reduce number of calls to scaleMultiplier when scale is known.
@ -104,23 +135,10 @@ inline bool decimalFromComponentsWithMultiplierImpl(
{
using T = typename DecimalType::NativeType;
const auto fractional_sign = whole < 0 ? -1 : 1;
T whole_scaled = 0;
if (common::mulOverflow(whole, scale_multiplier, whole_scaled))
{
if constexpr (throw_on_error)
throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
return false;
}
T value;
if (common::addOverflow(whole_scaled, fractional_sign * (fractional % scale_multiplier), value))
{
if constexpr (throw_on_error)
throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
if (!multiplyAdd<T, throw_on_error>(
whole, scale_multiplier, fractional_sign * (fractional % scale_multiplier), value))
return false;
}
result = DecimalType(value);
return true;
}

View File

@ -138,7 +138,7 @@ template <typename T> bool decimalEqual(T x, T y, UInt32 x_scale, UInt32 y_scale
template <typename T> bool decimalLess(T x, T y, UInt32 x_scale, UInt32 y_scale);
template <typename T> bool decimalLessOrEqual(T x, T y, UInt32 x_scale, UInt32 y_scale);
template <typename T>
template <is_decimal T>
class DecimalField
{
public:
@ -838,7 +838,7 @@ template <> struct Field::EnumToType<Field::Types::Decimal32> { using Type = Dec
template <> struct Field::EnumToType<Field::Types::Decimal64> { using Type = DecimalField<Decimal64>; };
template <> struct Field::EnumToType<Field::Types::Decimal128> { using Type = DecimalField<Decimal128>; };
template <> struct Field::EnumToType<Field::Types::Decimal256> { using Type = DecimalField<Decimal256>; };
template <> struct Field::EnumToType<Field::Types::AggregateFunctionState> { using Type = DecimalField<AggregateFunctionStateData>; };
template <> struct Field::EnumToType<Field::Types::AggregateFunctionState> { using Type = AggregateFunctionStateData; };
template <> struct Field::EnumToType<Field::Types::CustomType> { using Type = CustomType; };
template <> struct Field::EnumToType<Field::Types::Bool> { using Type = UInt64; };

View File

@ -39,7 +39,7 @@ namespace DB
M(UInt64, restore_threads, 16, "The maximum number of threads to execute RESTORE requests.", 0) \
M(Int32, max_connections, 1024, "Max server connections.", 0) \
M(UInt32, asynchronous_metrics_update_period_s, 1, "Period in seconds for updating asynchronous metrics.", 0) \
M(UInt32, asynchronous_heavy_metrics_update_period_s, 120, "Period in seconds for updating asynchronous metrics.", 0) \
M(UInt32, asynchronous_heavy_metrics_update_period_s, 120, "Period in seconds for updating heavy asynchronous metrics.", 0) \
M(String, default_database, "default", "Default database name.", 0) \
M(String, tmp_policy, "", "Policy for storage with temporary data.", 0) \
M(UInt64, max_temporary_data_on_disk_size, 0, "The maximum amount of storage that could be used for external aggregation, joins or sorting., ", 0) \

View File

@ -644,7 +644,7 @@ class IColumn;
M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \
M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow to create only Replicated tables in database with engine Replicated", 0) \
M(Bool, database_replicated_allow_replicated_engine_arguments, true, "Allow to create only Replicated tables in database with engine Replicated with explicit arguments", 0) \
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result, one of: 'none', 'throw', 'null_status_on_timeout', 'never_throw'", 0) \
M(UInt64, distributed_ddl_entry_format_version, 5, "Compatibility version of distributed DDL (ON CLUSTER) queries", 0) \
\
M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \
@ -779,6 +779,7 @@ class IColumn;
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
@ -876,8 +877,10 @@ class IColumn;
M(Bool, input_format_orc_case_insensitive_column_matching, false, "Ignore case when matching ORC columns with CH columns.", 0) \
M(Bool, input_format_parquet_case_insensitive_column_matching, false, "Ignore case when matching Parquet columns with CH columns.", 0) \
M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
M(Bool, input_format_parquet_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
@ -894,6 +897,10 @@ class IColumn;
M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \
M(Bool, input_format_csv_trim_whitespaces, true, "Trims spaces and tabs (\\t) characters at the beginning and end in CSV strings", 0) \
M(Bool, input_format_csv_use_default_on_bad_values, false, "Allow to set default value to column when CSV field deserialization failed on bad value", 0) \
M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \
M(Bool, input_format_tsv_allow_variable_number_of_columns, false, "Ignore extra columns in TSV input (if file has more columns than expected) and treat missing fields in TSV input as default values", 0) \
M(Bool, input_format_custom_allow_variable_number_of_columns, false, "Ignore extra columns in CustomSeparated input (if file has more columns than expected) and treat missing fields in CustomSeparated input as default values", 0) \
M(Bool, input_format_json_compact_allow_variable_number_of_columns, false, "Ignore extra columns in JSONCompact(EachRow) input (if file has more columns than expected) and treat missing fields in JSONCompact(EachRow) input as default values", 0) \
M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \
@ -1042,7 +1049,6 @@ class IColumn;
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
\
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
M(Bool, input_format_csv_allow_variable_number_of_columns, false, "Ignore extra columns in CSV input (if file has more columns than expected) and treat missing fields in CSV input as default values", 0) \
M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \
// End of FORMAT_FACTORY_SETTINGS

View File

@ -46,6 +46,7 @@ public:
bool canBeUsedInBooleanContext() const override { return dictionary_type->canBeUsedInBooleanContext(); }
bool isValueRepresentedByNumber() const override { return dictionary_type->isValueRepresentedByNumber(); }
bool isValueRepresentedByInteger() const override { return dictionary_type->isValueRepresentedByInteger(); }
bool isValueRepresentedByUnsignedInteger() const override { return dictionary_type->isValueRepresentedByUnsignedInteger(); }
bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; }
bool haveMaximumSizeOfValue() const override { return dictionary_type->haveMaximumSizeOfValue(); }
size_t getMaximumSizeOfValueInMemory() const override { return dictionary_type->getMaximumSizeOfValueInMemory(); }

View File

@ -10,6 +10,7 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/parseDateTimeBestEffort.h>
#include <IO/ReadBufferFromString.h>
namespace DB
{
@ -145,12 +146,29 @@ void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & is
char maybe_quote = *istr.position();
if (maybe_quote == '\'' || maybe_quote == '\"')
{
++istr.position();
readText(x, istr, settings, time_zone, utc_time_zone);
if (maybe_quote == '\'' || maybe_quote == '\"')
readText(x, istr, settings, time_zone, utc_time_zone);
assertChar(maybe_quote, istr);
}
else
{
if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic)
{
readText(x, istr, settings, time_zone, utc_time_zone);
}
/// Best effort parsing supports datetime in format like "01.01.2000, 00:00:00"
/// and can mistakenly read comma as a part of datetime.
/// For example data "...,01.01.2000,some string,..." cannot be parsed correctly.
/// To fix this problem we first read CSV string and then try to parse it as datetime.
else
{
String datetime_str;
readCSVString(datetime_str, istr, settings.csv);
ReadBufferFromString buf(datetime_str);
readText(x, buf, settings, time_zone, utc_time_zone);
}
}
if (x < 0)
x = 0;

View File

@ -9,6 +9,7 @@
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <IO/parseDateTimeBestEffort.h>
#include <IO/ReadBufferFromString.h>
namespace DB
{
@ -143,12 +144,29 @@ void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer &
char maybe_quote = *istr.position();
if (maybe_quote == '\'' || maybe_quote == '\"')
{
++istr.position();
readText(x, scale, istr, settings, time_zone, utc_time_zone);
if (maybe_quote == '\'' || maybe_quote == '\"')
readText(x, scale, istr, settings, time_zone, utc_time_zone);
assertChar(maybe_quote, istr);
}
else
{
if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic)
{
readText(x, scale, istr, settings, time_zone, utc_time_zone);
}
/// Best effort parsing supports datetime in format like "01.01.2000, 00:00:00"
/// and can mistakenly read comma as a part of datetime.
/// For example data "...,01.01.2000,some string,..." cannot be parsed correctly.
/// To fix this problem we first read CSV string and then try to parse it as datetime.
else
{
String datetime_str;
readCSVString(datetime_str, istr, settings.csv);
ReadBufferFromString buf(datetime_str);
readText(x, scale, buf, settings, time_zone, utc_time_zone);
}
}
assert_cast<ColumnType &>(column).getData().push_back(x);
}

View File

@ -830,6 +830,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
query_context->setSetting("allow_experimental_hash_functions", 1);
query_context->setSetting("allow_experimental_object_type", 1);
query_context->setSetting("allow_experimental_annoy_index", 1);
query_context->setSetting("allow_experimental_usearch_index", 1);
query_context->setSetting("allow_experimental_bigint_types", 1);
query_context->setSetting("allow_experimental_window_functions", 1);
query_context->setSetting("allow_experimental_geo_types", 1);

View File

@ -23,11 +23,10 @@ StoragePtr IDatabase::getTable(const String & name, ContextPtr context) const
return storage;
TableNameHints hints(this->shared_from_this(), context);
std::vector<String> names = hints.getHints(name);
if (!names.empty())
{
if (names.empty())
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name));
else
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist. Maybe you meant {}?", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name), backQuoteIfNeed(names[0]));
}
else throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} does not exist", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name));
}
std::vector<std::pair<ASTPtr, StoragePtr>> IDatabase::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const

View File

@ -86,6 +86,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
format_settings.custom.try_detect_header = settings.input_format_custom_detect_header;
format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines;
format_settings.custom.allow_variable_number_of_columns = settings.input_format_custom_allow_variable_number_of_columns;
format_settings.date_time_input_format = settings.date_time_input_format;
format_settings.date_time_output_format = settings.date_time_output_format;
format_settings.interval.output_format = settings.interval_output_format;
@ -115,6 +116,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.json.validate_utf8 = settings.output_format_json_validate_utf8;
format_settings.json_object_each_row.column_for_object_name = settings.format_json_object_each_row_column_for_object_name;
format_settings.json.allow_object_type = context->getSettingsRef().allow_experimental_object_type;
format_settings.json.compact_allow_variable_number_of_columns = settings.input_format_json_compact_allow_variable_number_of_columns;
format_settings.null_as_default = settings.input_format_null_as_default;
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;
format_settings.parquet.row_group_rows = settings.output_format_parquet_row_group_size;
@ -122,6 +124,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.output_version = settings.output_format_parquet_version;
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
format_settings.parquet.preserve_order = settings.input_format_parquet_preserve_order;
format_settings.parquet.filter_push_down = settings.input_format_parquet_filter_push_down;
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string;
@ -163,6 +166,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header;
format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines;
format_settings.tsv.allow_variable_number_of_columns = settings.input_format_tsv_allow_variable_number_of_columns;
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
@ -186,6 +190,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching;
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;

View File

@ -90,9 +90,6 @@ private:
const FormatSettings & settings)>;
// Incompatible with FileSegmentationEngine.
//
// In future we may also want to pass some information about WHERE conditions (SelectQueryInfo?)
// and get some information about projections (min/max/count per column per row group).
using RandomAccessInputCreator = std::function<InputFormatPtr(
ReadBuffer & buf,
const Block & header,

View File

@ -175,6 +175,7 @@ struct FormatSettings
EscapingRule escaping_rule = EscapingRule::Escaped;
bool try_detect_header = true;
bool skip_trailing_empty_lines = false;
bool allow_variable_number_of_columns = false;
} custom;
struct
@ -197,6 +198,7 @@ struct FormatSettings
bool validate_types_from_metadata = true;
bool validate_utf8 = false;
bool allow_object_type = false;
bool compact_allow_variable_number_of_columns = false;
} json;
struct
@ -229,6 +231,7 @@ struct FormatSettings
bool allow_missing_columns = false;
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
bool filter_push_down = true;
std::unordered_set<int> skip_row_groups = {};
bool output_string_as_string = false;
bool output_fixed_string_as_fixed_byte_array = true;
@ -317,6 +320,7 @@ struct FormatSettings
UInt64 skip_first_lines = 0;
bool try_detect_header = true;
bool skip_trailing_empty_lines = false;
bool allow_variable_number_of_columns = false;
} tsv;
struct
@ -344,6 +348,7 @@ struct FormatSettings
std::unordered_set<int> skip_stripes = {};
bool output_string_as_string = false;
ORCCompression output_compression_method = ORCCompression::NONE;
bool use_fast_decoder = true;
} orc;
/// For capnProto format we should determine how to

View File

@ -1,6 +1,7 @@
#pragma once
#include <type_traits>
#include <Core/AccurateComparison.h>
#include <Core/DecimalFunctions.h>
#include <Common/DateLUTImpl.h>
#include <DataTypes/DataTypeDate.h>
@ -14,7 +15,6 @@
#include <Functions/FunctionHelpers.h>
#include <Functions/castTypeToEither.h>
#include <Functions/extractTimeZoneFromFunctionArguments.h>
#include <Functions/TransformDateTime64.h>
#include <IO/WriteHelpers.h>
@ -36,7 +36,9 @@ namespace ErrorCodes
/// Corresponding types:
/// - UInt16 => DataTypeDate
/// - UInt32 => DataTypeDateTime
/// - Int32 => DataTypeDate32
/// - DateTime64 => DataTypeDateTime64
/// - Int8 => error
/// Please note that INPUT and OUTPUT types may differ, e.g.:
/// - 'AddSecondsImpl::execute(UInt32, ...) -> UInt32' is available to the ClickHouse users as 'addSeconds(DateTime, ...) -> DateTime'
/// - 'AddSecondsImpl::execute(UInt16, ...) -> UInt32' is available to the ClickHouse users as 'addSeconds(Date, ...) -> DateTime'
@ -45,35 +47,27 @@ struct AddNanosecondsImpl
{
static constexpr auto name = "addNanoseconds";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 scale = DataTypeDateTime64::default_scale)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(9 - scale);
auto division = std::div(t.fractional * multiplier + delta, static_cast<Int64>(1000000000));
return {t.whole * multiplier + division.quot, t.fractional * multiplier + delta};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(9 - scale);
return t * multiplier + delta;
return DateTime64(DecimalUtils::multiplyAdd(t.value, multiplier, delta));
}
static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(9);
return static_cast<UInt32>(t * multiplier + delta);
return DateTime64(DecimalUtils::multiplyAdd(static_cast<Int64>(t), multiplier, delta));
}
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addNanoSeconds() cannot be used with Date");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addNanoseconds() cannot be used with Date");
}
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addNanoSeconds() cannot be used with Date32");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addNanoseconds() cannot be used with Date32");
}
};
@ -81,43 +75,29 @@ struct AddMicrosecondsImpl
{
static constexpr auto name = "addMicroseconds";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(6 - scale));
if (scale <= 6)
{
auto division = std::div((t.fractional + delta), static_cast<Int64>(10e6));
return {t.whole * multiplier + division.quot, division.rem};
}
else
{
auto division = std::div((t.fractional + delta * multiplier), static_cast<Int64>(10e6 * multiplier));
return {t.whole + division.quot, division.rem};
}
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(6 - scale));
return scale <= 6 ? t * multiplier + delta : t + delta * multiplier;
return DateTime64(scale <= 6
? DecimalUtils::multiplyAdd(t.value, multiplier, delta)
: DecimalUtils::multiplyAdd(delta, multiplier, t.value));
}
static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(6);
return static_cast<UInt32>(t * multiplier + delta);
return DateTime64(DecimalUtils::multiplyAdd(static_cast<Int64>(t), multiplier, delta));
}
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMicroSeconds() cannot be used with Date");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMicroseconds() cannot be used with Date");
}
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMicroSeconds() cannot be used with Date32");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMicroseconds() cannot be used with Date32");
}
};
@ -125,43 +105,29 @@ struct AddMillisecondsImpl
{
static constexpr auto name = "addMilliseconds";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 scale = DataTypeDateTime64::default_scale)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(3 - scale));
if (scale <= 3)
{
auto division = std::div((t.fractional + delta), static_cast<Int64>(1000));
return {t.whole * multiplier + division.quot, division.rem};
}
else
{
auto division = std::div((t.fractional + delta * multiplier), static_cast<Int64>(1000 * multiplier));
return {t.whole + division.quot,division.rem};
}
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(std::abs(3 - scale));
return scale <= 3 ? t * multiplier + delta : t + delta * multiplier;
return DateTime64(scale <= 3
? DecimalUtils::multiplyAdd(t.value, multiplier, delta)
: DecimalUtils::multiplyAdd(delta, multiplier, t.value));
}
static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
Int64 multiplier = DecimalUtils::scaleMultiplier<DateTime64>(3);
return static_cast<UInt32>(t * multiplier + delta);
return DateTime64(DecimalUtils::multiplyAdd(static_cast<Int64>(t), multiplier, delta));
}
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(UInt16, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMilliSeconds() cannot be used with Date");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMilliseconds() cannot be used with Date");
}
static inline NO_SANITIZE_UNDEFINED DateTime64 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
static inline NO_SANITIZE_UNDEFINED Int8 execute(Int32, Int64, const DateLUTImpl &, UInt16 = 0)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "addMilliSeconds() cannot be used with Date32");
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMilliseconds() cannot be used with Date32");
}
};
@ -169,16 +135,10 @@ struct AddSecondsImpl
{
static constexpr auto name = "addSeconds";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
return {t.whole + delta, t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
return t + delta * DecimalUtils::scaleMultiplier<DateTime64>(scale);
return DateTime64(DecimalUtils::multiplyAdd(delta, DecimalUtils::scaleMultiplier<DateTime64>(scale), t.value));
}
static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt32 t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
@ -189,6 +149,7 @@ struct AddSecondsImpl
static inline NO_SANITIZE_UNDEFINED Int64 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
// use default datetime64 scale
static_assert(DataTypeDateTime64::default_scale == 3, "");
return (time_zone.fromDayNum(ExtendedDayNum(d)) + delta) * 1000;
}
@ -202,12 +163,6 @@ struct AddMinutesImpl
{
static constexpr auto name = "addMinutes";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
return {t.whole + delta * 60, t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
@ -222,6 +177,7 @@ struct AddMinutesImpl
static inline NO_SANITIZE_UNDEFINED Int64 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
// use default datetime64 scale
static_assert(DataTypeDateTime64::default_scale == 3, "");
return (time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 60) * 1000;
}
@ -235,12 +191,6 @@ struct AddHoursImpl
{
static constexpr auto name = "addHours";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl &, UInt16 = 0)
{
return {t.whole + delta * 3600, t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl &, UInt16 scale = 0)
{
@ -255,6 +205,7 @@ struct AddHoursImpl
static inline NO_SANITIZE_UNDEFINED Int64 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
// use default datetime64 scale
static_assert(DataTypeDateTime64::default_scale == 3, "");
return (time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 3600) * 1000;
}
@ -268,12 +219,6 @@ struct AddDaysImpl
{
static constexpr auto name = "addDays";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addDays(t.whole, delta), t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -302,12 +247,6 @@ struct AddWeeksImpl
{
static constexpr auto name = "addWeeks";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addWeeks(t.whole, delta), t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -336,12 +275,6 @@ struct AddMonthsImpl
{
static constexpr auto name = "addMonths";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addMonths(t.whole, delta), t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -370,12 +303,6 @@ struct AddQuartersImpl
{
static constexpr auto name = "addQuarters";
static inline DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addQuarters(t.whole, delta), t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -404,12 +331,6 @@ struct AddYearsImpl
{
static constexpr auto name = "addYears";
static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents<DateTime64>
execute(DecimalUtils::DecimalComponents<DateTime64> t, Int64 delta, const DateLUTImpl & time_zone, UInt16 = 0)
{
return {time_zone.addYears(t.whole, delta), t.fractional};
}
static inline NO_SANITIZE_UNDEFINED DateTime64
execute(DateTime64 t, Int64 delta, const DateLUTImpl & time_zone, UInt16 scale = 0)
{
@ -581,11 +502,11 @@ namespace date_and_time_type_details
// Compile-time mapping of value (DataType::FieldType) types to corresponding DataType
template <typename FieldType> struct ResultDataTypeMap {};
template <> struct ResultDataTypeMap<UInt16> { using ResultDataType = DataTypeDate; };
template <> struct ResultDataTypeMap<Int16> { using ResultDataType = DataTypeDate; };
template <> struct ResultDataTypeMap<UInt32> { using ResultDataType = DataTypeDateTime; };
template <> struct ResultDataTypeMap<Int32> { using ResultDataType = DataTypeDate32; };
template <> struct ResultDataTypeMap<DateTime64> { using ResultDataType = DataTypeDateTime64; };
template <> struct ResultDataTypeMap<Int64> { using ResultDataType = DataTypeDateTime64; };
template <> struct ResultDataTypeMap<Int8> { using ResultDataType = DataTypeInt8; }; // error
}
template <typename Transform>
@ -705,6 +626,10 @@ public:
return std::make_shared<DataTypeDateTime64>(target_scale.value_or(DataTypeDateTime64::default_scale), std::move(timezone));
}
else if constexpr (std::is_same_v<ResultDataType, DataTypeInt8>)
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} cannot be used with {}", getName(), arguments[0].type->getName());
}
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected result type in datetime add interval function");
}

View File

@ -0,0 +1,141 @@
#include <Columns/ColumnConst.h>
#include <Columns/ColumnsDateTime.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/IColumn.h>
#include <Common/DateLUT.h>
#include <Common/LocalDateTime.h>
#include <Common/logger_useful.h>
#include <Core/DecimalFunctions.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/TimezoneMixin.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h>
#include <Interpreters/Context.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
namespace
{
/// SQL function that shifts a DateTime / DateTime64 value between UTC and a named time zone.
///
/// `Name` supplies three compile-time constants:
///   - `name`: the SQL-visible function name;
///   - `to`:   when true, the input value is read using the UTC lookup table, otherwise the named zone's;
///   - `from`: when true, the output value is produced using the UTC lookup table, otherwise the named zone's.
///
/// Arguments: (DateTime | DateTime64 without explicit time zone, constant String time zone name).
/// The result has the same type (and, for DateTime64, the same scale) as the first argument.
template <typename Name>
class UTCTimestampTransform : public IFunction
{
public:
    static FunctionPtr create(ContextPtr) { return std::make_shared<UTCTimestampTransform>(); }

    static constexpr auto name = Name::name;

    String getName() const override { return name; }

    size_t getNumberOfArguments() const override { return 2; }

    bool useDefaultImplementationForConstants() const override { return true; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// The time zone name (argument index 1) must be a constant.
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }

    /// Validates the argument types and returns DateTime, or DateTime64 with the scale of the input.
    /// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or ILLEGAL_TYPE_OF_ARGUMENT on invalid arguments.
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (arguments.size() != 2)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}'s arguments number must be 2.", name);

        WhichDataType which_type_first(arguments[0]);
        WhichDataType which_type_second(arguments[1]);

        if (!which_type_first.isDateTime() && !which_type_first.isDateTime64())
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s 1st argument type must be datetime.", name);

        /// Guard the cross-cast result instead of dereferencing it blindly.
        const auto * timezone_mixin = dynamic_cast<const TimezoneMixin *>(arguments[0].get());
        if (timezone_mixin && timezone_mixin->hasExplicitTimeZone())
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s 1st argument should not have explicit time zone.", name);

        if (!which_type_second.isString())
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s 2nd argument type must be string.", name);

        if (which_type_first.isDateTime())
            return std::make_shared<DataTypeDateTime>();

        const auto * date_time_64 = static_cast<const DataTypeDateTime64 *>(arguments[0].get());
        return std::make_shared<DataTypeDateTime64>(date_time_64->getScale());
    }

    /// Converts every row of the first argument using the time zone named by the second argument.
    /// Throws ILLEGAL_COLUMN when an argument column has an unexpected physical type.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override
    {
        if (arguments.size() != 2)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {}'s arguments number must be 2.", name);

        /// Bind by reference: copying ColumnWithTypeAndName is unnecessary here.
        const ColumnWithTypeAndName & arg1 = arguments[0];
        const ColumnWithTypeAndName & arg2 = arguments[1];

        const auto * time_zone_const_col = checkAndGetColumnConstData<ColumnString>(arg2.column.get());
        if (!time_zone_const_col)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of 2nd argument of function {}. Expected const(String).", arg2.column->getName(), name);
        const String time_zone_val = time_zone_const_col->getDataAt(0).toString();

        /// The lookup tables do not depend on the row, so resolve them once instead of twice per row.
        const DateLUTImpl & utc_lut = DateLUT::instance("UTC");
        const DateLUTImpl & named_lut = DateLUT::instance(time_zone_val);
        const DateLUTImpl & source_lut = Name::to ? utc_lut : named_lut;
        const DateLUTImpl & target_lut = Name::from ? utc_lut : named_lut;

        auto column = result_type->createColumn();
        const WhichDataType which_type_first(arg1.type);

        if (which_type_first.isDateTime())
        {
            const auto * date_time_col = checkAndGetColumn<ColumnDateTime>(arg1.column.get());
            if (!date_time_col)
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of 1st argument of function {}.", arg1.column->getName(), name);

            const size_t rows = date_time_col->size();
            column->reserve(rows);
            for (size_t i = 0; i < rows; ++i)
            {
                UInt32 date_time_val = date_time_col->getElement(i);
                LocalDateTime date_time(date_time_val, source_lut);
                time_t time_val = date_time.to_time_t(target_lut);
                column->insert(time_val);
            }
        }
        else if (which_type_first.isDateTime64())
        {
            const auto * date_time_col = checkAndGetColumn<ColumnDateTime64>(arg1.column.get());
            if (!date_time_col)
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of 1st argument of function {}.", arg1.column->getName(), name);

            const auto * date_time_type = static_cast<const DataTypeDateTime64 *>(arg1.type.get());
            const Int64 scale_multiplier = DecimalUtils::scaleMultiplier<Int64>(date_time_type->getScale());

            const size_t rows = date_time_col->size();
            column->reserve(rows);
            for (size_t i = 0; i < rows; ++i)
            {
                DateTime64 date_time_val = date_time_col->getElement(i);
                /// Only the whole seconds are converted; the sub-second remainder is carried over unchanged.
                Int64 seconds = date_time_val.value / scale_multiplier;
                Int64 fractional = date_time_val.value % scale_multiplier;
                LocalDateTime date_time(seconds, source_lut);
                time_t time_val = date_time.to_time_t(target_lut);
                DateTime64 date_time_64(time_val * scale_multiplier + fractional);
                column->insert(date_time_64);
            }
        }
        else
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}'s 1st argument can only be datetime/datetime64.", name);

        return column;
    }
};
/// Compile-time traits selecting the `toUTCTimestamp` flavour of UTCTimestampTransform:
/// `to` is set and `from` is cleared.
struct NameToUTCTimestamp
{
    static constexpr const char * name = "toUTCTimestamp";
    static constexpr bool from = false;
    static constexpr bool to = true;
};
/// Compile-time traits selecting the `fromUTCTimestamp` flavour of UTCTimestampTransform:
/// `from` is set and `to` is cleared.
struct NameFromUTCTimestamp
{
    static constexpr const char * name = "fromUTCTimestamp";
    static constexpr bool from = true;
    static constexpr bool to = false;
};
/// Concrete function classes: the shared UTCTimestampTransform template instantiated
/// with each of the two name/direction trait structs.
using ToUTCTimestampFunction = UTCTimestampTransform<NameToUTCTimestamp>;
using FromUTCTimestampFunction = UTCTimestampTransform<NameFromUTCTimestamp>;
}
/// Registers both conversion functions in the function factory, each followed by its
/// case-insensitive snake_case alias.
REGISTER_FUNCTION(UTCTimestampTransform)
{
    /// toUTCTimestamp / to_utc_timestamp
    factory.registerFunction<ToUTCTimestampFunction>();
    factory.registerAlias("to_utc_timestamp", NameToUTCTimestamp::name, FunctionFactory::CaseInsensitive);

    /// fromUTCTimestamp / from_utc_timestamp
    factory.registerFunction<FromUTCTimestampFunction>();
    factory.registerAlias("from_utc_timestamp", NameFromUTCTimestamp::name, FunctionFactory::CaseInsensitive);
}
}

View File

@ -7,6 +7,8 @@
#include <string_view>
#include <base/simd.h>
#ifdef __SSE2__
# include <emmintrin.h>
#endif
@ -73,16 +75,13 @@ struct ToValidUTF8Impl
/// Fast skip of ASCII for aarch64.
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask = [](uint8x16_t input) -> uint64_t
{ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
/// Other options include
/// vmaxvq_u8(input) < 0b10000000;
/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
/// SIMDJSON uses it for 64 byte masks, so it's a little different.
/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
/// shrn version has universally <=3 cycles, on servers 2 cycles.
while (p < simd_end && get_nibble_mask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
p += SIMD_BYTES;
if (!(p < end))

View File

@ -50,8 +50,8 @@ public:
/// Starts reading a file from the archive. The function returns a read buffer,
/// you can read that buffer to extract uncompressed data from the archive.
/// Several read buffers can be used at the same time in parallel.
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) = 0;
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) = 0;
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename, bool throw_on_not_found) = 0;
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter, bool throw_on_not_found) = 0;
/// It's possible to convert a file enumerator to a read buffer and vice versa.
virtual std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) = 0;

View File

@ -155,7 +155,7 @@ private:
archive_read_support_filter_all(archive);
archive_read_support_format_all(archive);
if (archive_read_open_filename(archive, path_to_archive.c_str(), 10240) != ARCHIVE_OK)
throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open archive: {}", quoteString(path_to_archive));
throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't open archive {}: {}", quoteString(path_to_archive), archive_error_string(archive));
}
catch (...)
{
@ -293,17 +293,21 @@ std::unique_ptr<LibArchiveReader::FileEnumerator> LibArchiveReader::firstFile()
return std::make_unique<FileEnumeratorImpl>(std::move(handle));
}
std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(const String & filename)
std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(const String & filename, bool throw_on_not_found)
{
return readFile([&](const std::string & file) { return file == filename; });
return readFile([&](const std::string & file) { return file == filename; }, throw_on_not_found);
}
std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(NameFilter filter)
std::unique_ptr<ReadBufferFromFileBase> LibArchiveReader::readFile(NameFilter filter, bool throw_on_not_found)
{
Handle handle(path_to_archive, lock_on_reading);
if (!handle.locateFile(filter))
throw Exception(
ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: no file found satisfying the filter", path_to_archive);
{
if (throw_on_not_found)
throw Exception(
ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: no file found satisfying the filter", path_to_archive);
return nullptr;
}
return std::make_unique<ReadBufferFromLibArchive>(std::move(handle), path_to_archive);
}

View File

@ -34,8 +34,8 @@ public:
/// Starts reading a file from the archive. The function returns a read buffer,
/// you can read that buffer to extract uncompressed data from the archive.
/// Several read buffers can be used at the same time in parallel.
std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename, bool throw_on_not_found) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter, bool throw_on_not_found) override;
/// It's possible to convert a file enumerator to a read buffer and vice versa.
std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;

View File

@ -75,21 +75,22 @@ public:
RawHandle getRawHandle() const { return raw_handle; }
std::shared_ptr<ZipArchiveReader> getReader() const { return reader; }
void locateFile(const String & file_name_)
bool locateFile(const String & file_name_)
{
resetFileInfo();
bool case_sensitive = true;
int err = unzLocateFile(raw_handle, file_name_.c_str(), reinterpret_cast<unzFileNameComparer>(static_cast<size_t>(case_sensitive)));
if (err == UNZ_END_OF_LIST_OF_FILE)
showError("File " + quoteString(file_name_) + " not found");
return false;
file_name = file_name_;
return true;
}
void locateFile(NameFilter filter)
bool locateFile(NameFilter filter)
{
int err = unzGoToFirstFile(raw_handle);
if (err == UNZ_END_OF_LIST_OF_FILE)
showError("No file was found satisfying the filter");
return false;
do
{
@ -97,12 +98,12 @@ public:
resetFileInfo();
retrieveFileInfo();
if (filter(getFileName()))
return;
return true;
err = unzGoToNextFile(raw_handle);
} while (err != UNZ_END_OF_LIST_OF_FILE);
showError("No file was found satisfying the filter");
return false;
}
bool tryLocateFile(const String & file_name_)
@ -513,7 +514,9 @@ bool ZipArchiveReader::fileExists(const String & filename)
ZipArchiveReader::FileInfo ZipArchiveReader::getFileInfo(const String & filename)
{
auto handle = acquireHandle();
handle.locateFile(filename);
if (!handle.locateFile(filename))
showError(fmt::format("File {} was not found in archive", quoteString(filename)));
return handle.getFileInfo();
}
@ -525,17 +528,31 @@ std::unique_ptr<ZipArchiveReader::FileEnumerator> ZipArchiveReader::firstFile()
return std::make_unique<FileEnumeratorImpl>(std::move(handle));
}
std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(const String & filename)
std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(const String & filename, bool throw_on_not_found)
{
auto handle = acquireHandle();
handle.locateFile(filename);
if (!handle.locateFile(filename))
{
if (throw_on_not_found)
showError(fmt::format("File {} was not found in archive", quoteString(filename)));
return nullptr;
}
return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
}
std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(NameFilter filter)
std::unique_ptr<ReadBufferFromFileBase> ZipArchiveReader::readFile(NameFilter filter, bool throw_on_not_found)
{
auto handle = acquireHandle();
handle.locateFile(filter);
if (!handle.locateFile(filter))
{
if (throw_on_not_found)
showError(fmt::format("No file satisfying filter in archive"));
return nullptr;
}
return std::make_unique<ReadBufferFromZipArchive>(std::move(handle));
}

View File

@ -41,8 +41,8 @@ public:
/// Starts reading a file from the archive. The function returns a read buffer,
/// you can read that buffer to extract uncompressed data from the archive.
/// Several read buffers can be used at the same time in parallel.
std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(const String & filename, bool throw_on_not_found) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(NameFilter filter, bool throw_on_not_found) override;
/// It's possible to convert a file enumerator to a read buffer and vice versa.
std::unique_ptr<ReadBufferFromFileBase> readFile(std::unique_ptr<FileEnumerator> enumerator) override;

View File

@ -24,6 +24,18 @@ std::shared_ptr<IArchiveReader> createArchiveReader(
[[maybe_unused]] const std::function<std::unique_ptr<SeekableReadBuffer>()> & archive_read_function,
[[maybe_unused]] size_t archive_size)
{
using namespace std::literals;
static constexpr std::array tar_extensions
{
".tar"sv,
".tar.gz"sv,
".tgz"sv,
".tar.zst"sv,
".tzst"sv,
".tar.xz"sv,
".tar.bz2"sv
};
if (path_to_archive.ends_with(".zip") || path_to_archive.ends_with(".zipx"))
{
#if USE_MINIZIP
@ -32,7 +44,8 @@ std::shared_ptr<IArchiveReader> createArchiveReader(
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "minizip library is disabled");
#endif
}
else if (path_to_archive.ends_with(".tar") || path_to_archive.ends_with("tar.gz"))
else if (std::any_of(
tar_extensions.begin(), tar_extensions.end(), [&](const auto extension) { return path_to_archive.ends_with(extension); }))
{
#if USE_LIBARCHIVE
return std::make_shared<TarArchiveReader>(path_to_archive);

View File

@ -19,7 +19,10 @@ public:
/// A read buffer that carries its own string: it inherits from String for the storage and
/// from ReadBufferFromString for the reading interface.
/// String is listed first among the bases, so it is fully constructed before `*this`
/// is handed to the ReadBufferFromString constructor.
class ReadBufferFromOwnString : public String, public ReadBufferFromString
{
public:
/// Copies `s_` into the inherited String.
explicit ReadBufferFromOwnString(const String & s_): String(s_), ReadBufferFromString(*this) {}
/// Forwards `s_` to the String constructor, so an rvalue argument is moved rather than copied.
template <typename S>
explicit ReadBufferFromOwnString(S && s_) : String(std::forward<S>(s_)), ReadBufferFromString(*this)
{
}
};
}

View File

@ -12,6 +12,8 @@
#include <cstdlib>
#include <bit>
#include <base/simd.h>
#ifdef __SSE2__
#include <emmintrin.h>
#endif
@ -819,14 +821,11 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
auto rc = vdupq_n_u8('\r');
auto nc = vdupq_n_u8('\n');
auto dc = vdupq_n_u8(delimiter);
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask = [](uint8x16_t input) -> uint64_t
{ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
for (; next_pos + 15 < buf.buffer().end(); next_pos += 16)
{
uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(next_pos));
auto eq = vorrq_u8(vorrq_u8(vceqq_u8(bytes, rc), vceqq_u8(bytes, nc)), vceqq_u8(bytes, dc));
uint64_t bit_mask = get_nibble_mask(eq);
uint64_t bit_mask = getNibbleMask(eq);
if (bit_mask)
{
next_pos += std::countr_zero(bit_mask) >> 2;

View File

@ -1,6 +1,7 @@
#include <Poco/UTF8Encoding.h>
#include <IO/WriteBufferValidUTF8.h>
#include <base/types.h>
#include <base/simd.h>
#ifdef __SSE2__
#include <emmintrin.h>
@ -84,16 +85,13 @@ void WriteBufferValidUTF8::nextImpl()
/// Fast skip of ASCII for aarch64.
static constexpr size_t SIMD_BYTES = 16;
const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;
/// Returns a 64 bit mask of nibbles (4 bits for each byte).
auto get_nibble_mask = [](uint8x16_t input) -> uint64_t
{ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(input), 4)), 0); };
/// Other options include
/// vmaxvq_u8(input) < 0b10000000;
/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
/// SIMDJSON uses it for 64 byte masks, so it's a little different.
/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
/// shrn version has universally <=3 cycles, on servers 2 cycles.
while (p < simd_end && get_nibble_mask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
p += SIMD_BYTES;
if (!(p < pos))

View File

@ -113,11 +113,11 @@ TEST_P(ArchiveReaderAndWriterTest, EmptyArchive)
EXPECT_FALSE(reader->fileExists("nofile.txt"));
expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' not found",
expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' was not found in archive",
[&]{ reader->getFileInfo("nofile.txt"); });
expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' not found",
[&]{ reader->readFile("nofile.txt"); });
expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "File 'nofile.txt' was not found in archive",
[&]{ reader->readFile("nofile.txt", /*throw_on_not_found=*/true); });
EXPECT_EQ(reader->firstFile(), nullptr);
}
@ -145,7 +145,7 @@ TEST_P(ArchiveReaderAndWriterTest, SingleFileInArchive)
EXPECT_GT(file_info.compressed_size, 0);
{
auto in = reader->readFile("a.txt");
auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents);
@ -215,14 +215,14 @@ TEST_P(ArchiveReaderAndWriterTest, TwoFilesInArchive)
EXPECT_EQ(reader->getFileInfo("b/c.txt").uncompressed_size, c_contents.size());
{
auto in = reader->readFile("a.txt");
auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, a_contents);
}
{
auto in = reader->readFile("b/c.txt");
auto in = reader->readFile("b/c.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, c_contents);
@ -230,7 +230,7 @@ TEST_P(ArchiveReaderAndWriterTest, TwoFilesInArchive)
{
/// Read a.txt again.
auto in = reader->readFile("a.txt");
auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, a_contents);
@ -302,14 +302,14 @@ TEST_P(ArchiveReaderAndWriterTest, InMemory)
EXPECT_EQ(reader->getFileInfo("b.txt").uncompressed_size, b_contents.size());
{
auto in = reader->readFile("a.txt");
auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, a_contents);
}
{
auto in = reader->readFile("b.txt");
auto in = reader->readFile("b.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, b_contents);
@ -317,7 +317,7 @@ TEST_P(ArchiveReaderAndWriterTest, InMemory)
{
/// Read a.txt again.
auto in = reader->readFile("a.txt");
auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, a_contents);
@ -343,19 +343,19 @@ TEST_P(ArchiveReaderAndWriterTest, Password)
/// Try to read without a password.
expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Password is required",
[&]{ reader->readFile("a.txt"); });
[&]{ reader->readFile("a.txt", /*throw_on_not_found=*/true); });
{
/// Try to read with a wrong password.
reader->setPassword("123Qwe");
expectException(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Wrong password",
[&]{ reader->readFile("a.txt"); });
[&]{ reader->readFile("a.txt", /*throw_on_not_found=*/true); });
}
{
/// Reading with the right password is successful.
reader->setPassword("Qwe123");
auto in = reader->readFile("a.txt");
auto in = reader->readFile("a.txt", /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents);
@ -387,7 +387,7 @@ TEST(TarArchiveReaderTest, ReadFile) {
bool created = createArchiveWithFiles<ArchiveType::Tar>(archive_path, {{filename, contents}});
EXPECT_EQ(created, true);
auto reader = createArchiveReader(archive_path);
auto in = reader->readFile(filename);
auto in = reader->readFile(filename, /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents);
@ -405,11 +405,11 @@ TEST(TarArchiveReaderTest, ReadTwoFiles) {
auto reader = createArchiveReader(archive_path);
EXPECT_EQ(reader->fileExists(file1), true);
EXPECT_EQ(reader->fileExists(file2), true);
auto in = reader->readFile(file1);
auto in = reader->readFile(file1, /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents1);
in = reader->readFile(file2);
in = reader->readFile(file2, /*throw_on_not_found=*/true);
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents2);
@ -448,7 +448,7 @@ TEST(SevenZipArchiveReaderTest, ReadFile) {
bool created = createArchiveWithFiles<ArchiveType::SevenZip>(archive_path, {{filename, contents}});
EXPECT_EQ(created, true);
auto reader = createArchiveReader(archive_path);
auto in = reader->readFile(filename);
auto in = reader->readFile(filename, /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents);
@ -479,11 +479,11 @@ TEST(SevenZipArchiveReaderTest, ReadTwoFiles) {
auto reader = createArchiveReader(archive_path);
EXPECT_EQ(reader->fileExists(file1), true);
EXPECT_EQ(reader->fileExists(file2), true);
auto in = reader->readFile(file1);
auto in = reader->readFile(file1, /*throw_on_not_found=*/true);
String str;
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents1);
in = reader->readFile(file2);
in = reader->readFile(file2, /*throw_on_not_found=*/true);
readStringUntilEOF(str, *in);
EXPECT_EQ(str, contents2);

View File

@ -471,6 +471,21 @@ std::unique_ptr<SourceFromChunks> QueryCache::Reader::getSourceExtremes()
return std::move(source_from_chunks_extremes);
}
/// Constructs the query cache. The underlying cache is created with a TTLCachePolicy that is
/// parameterized by a PerUserTTLCachePolicyUserQuota object; the size limits passed in are then
/// applied through updateConfiguration().
QueryCache::QueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
: cache(std::make_unique<TTLCachePolicy<Key, Entry, KeyHasher, QueryCacheEntryWeight, IsStale>>(std::make_unique<PerUserTTLCachePolicyUserQuota>()))
{
/// Reuse updateConfiguration() so that construction and later reconfiguration set the limits in the same way.
updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes_, max_entry_size_in_rows_);
}
/// Applies new limits to the query cache while holding the QueryCache mutex.
/// `max_size_in_bytes` and `max_entries` are forwarded to the underlying cache; the two per-entry
/// limits are stored in `max_entry_size_in_bytes` / `max_entry_size_in_rows`.
void QueryCache::updateConfiguration(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
{
    std::lock_guard guard(mutex);

    /// Limits of the cache as a whole.
    cache.setMaxSize(max_size_in_bytes);
    cache.setMaxCount(max_entries);

    /// Limits of a single cache entry.
    this->max_entry_size_in_bytes = max_entry_size_in_bytes_;
    this->max_entry_size_in_rows = max_entry_size_in_rows_;
}
QueryCache::Reader QueryCache::createReader(const Key & key)
{
std::lock_guard lock(mutex);
@ -488,9 +503,9 @@ QueryCache::Writer QueryCache::createWriter(const Key & key, std::chrono::millis
return Writer(cache, key, max_entry_size_in_bytes, max_entry_size_in_rows, min_query_runtime, squash_partial_results, max_block_size);
}
void QueryCache::reset()
void QueryCache::clear()
{
cache.reset();
cache.clear();
std::lock_guard lock(mutex);
times_executed.clear();
}
@ -521,19 +536,4 @@ std::vector<QueryCache::Cache::KeyMapped> QueryCache::dump() const
return cache.dump();
}
QueryCache::QueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
: cache(std::make_unique<TTLCachePolicy<Key, Entry, KeyHasher, QueryCacheEntryWeight, IsStale>>(std::make_unique<PerUserTTLCachePolicyUserQuota>()))
{
updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes_, max_entry_size_in_rows_);
}
void QueryCache::updateConfiguration(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
{
std::lock_guard lock(mutex);
cache.setMaxSize(max_size_in_bytes);
cache.setMaxCount(max_entries);
max_entry_size_in_bytes = max_entry_size_in_bytes_;
max_entry_size_in_rows = max_entry_size_in_rows_;
}
}

View File

@ -180,7 +180,7 @@ public:
Reader createReader(const Key & key);
Writer createWriter(const Key & key, std::chrono::milliseconds min_query_runtime, bool squash_partial_results, size_t max_block_size, size_t max_query_cache_size_in_bytes_quota, size_t max_query_cache_entries_quota);
void reset();
void clear();
size_t weight() const;
size_t count() const;

View File

@ -548,7 +548,7 @@ struct ContextSharedPart : boost::noncopyable
*/
#if USE_EMBEDDED_COMPILER
if (auto * cache = CompiledExpressionCacheFactory::instance().tryGetCache())
cache->reset();
cache->clear();
#endif
/// Preemptive destruction is important, because these objects may have a refcount to ContextShared (cyclic reference).
@ -2278,6 +2278,16 @@ void Context::setUncompressedCache(const String & uncompressed_cache_policy, siz
shared->uncompressed_cache = std::make_shared<UncompressedCache>(uncompressed_cache_policy, max_size_in_bytes);
}
/// Reads `uncompressed_cache_size` from the given configuration and applies it as the new maximum
/// size of the already created uncompressed cache.
/// Throws LOGICAL_ERROR if the uncompressed cache does not exist.
void Context::updateUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
    auto lock = getLock();

    if (shared->uncompressed_cache == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Uncompressed cache was not created yet.");

    /// DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE is used when the key is absent from the configuration.
    const size_t new_max_size = config.getUInt64("uncompressed_cache_size", DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE);
    shared->uncompressed_cache->setMaxSize(new_max_size);
}
UncompressedCachePtr Context::getUncompressedCache() const
{
@ -2285,14 +2295,13 @@ UncompressedCachePtr Context::getUncompressedCache() const
return shared->uncompressed_cache;
}
void Context::clearUncompressedCache() const
{
auto lock = getLock();
if (shared->uncompressed_cache)
shared->uncompressed_cache->reset();
}
if (shared->uncompressed_cache)
shared->uncompressed_cache->clear();
}
void Context::setMarkCache(const String & mark_cache_policy, size_t cache_size_in_bytes)
{
@ -2304,6 +2313,17 @@ void Context::setMarkCache(const String & mark_cache_policy, size_t cache_size_i
shared->mark_cache = std::make_shared<MarkCache>(mark_cache_policy, cache_size_in_bytes);
}
/// Reads `mark_cache_size` from the given configuration and applies it as the new maximum size of
/// the already created mark cache.
/// Throws LOGICAL_ERROR if the mark cache does not exist.
void Context::updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
    auto lock = getLock();

    if (shared->mark_cache == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Mark cache was not created yet.");

    /// DEFAULT_MARK_CACHE_MAX_SIZE is used when the key is absent from the configuration.
    const size_t new_max_size = config.getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE);
    shared->mark_cache->setMaxSize(new_max_size);
}
MarkCachePtr Context::getMarkCache() const
{
auto lock = getLock();
@ -2313,8 +2333,9 @@ MarkCachePtr Context::getMarkCache() const
void Context::clearMarkCache() const
{
auto lock = getLock();
if (shared->mark_cache)
shared->mark_cache->reset();
shared->mark_cache->clear();
}
ThreadPool & Context::getLoadMarksThreadpool() const
@ -2342,20 +2363,30 @@ void Context::setIndexUncompressedCache(size_t max_size_in_bytes)
shared->index_uncompressed_cache = std::make_shared<UncompressedCache>(max_size_in_bytes);
}
/// Reads `index_uncompressed_cache_size` from the given configuration and applies it as the new
/// maximum size of the already created index uncompressed cache.
/// Throws LOGICAL_ERROR if the index uncompressed cache does not exist.
void Context::updateIndexUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
    auto lock = getLock();

    if (shared->index_uncompressed_cache == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Index uncompressed cache was not created yet.");

    /// DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE is used when the key is absent from the configuration.
    const size_t new_max_size = config.getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE);
    shared->index_uncompressed_cache->setMaxSize(new_max_size);
}
/// Returns the pointer to the index uncompressed cache; the pointer is read while the context lock is held.
/// NOTE(review): the pointer is returned without a null check, so it is presumably empty when the
/// cache has not been created — confirm that callers handle that.
UncompressedCachePtr Context::getIndexUncompressedCache() const
{
auto lock = getLock();
return shared->index_uncompressed_cache;
}
void Context::clearIndexUncompressedCache() const
{
auto lock = getLock();
if (shared->index_uncompressed_cache)
shared->index_uncompressed_cache->reset();
}
if (shared->index_uncompressed_cache)
shared->index_uncompressed_cache->clear();
}
void Context::setIndexMarkCache(size_t cache_size_in_bytes)
{
@ -2367,6 +2398,17 @@ void Context::setIndexMarkCache(size_t cache_size_in_bytes)
shared->index_mark_cache = std::make_shared<MarkCache>(cache_size_in_bytes);
}
/// Reads `index_mark_cache_size` from the given configuration and applies it as the new maximum
/// size of the already created index mark cache.
/// Throws LOGICAL_ERROR if the index mark cache does not exist.
void Context::updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
    auto lock = getLock();

    if (shared->index_mark_cache == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Index mark cache was not created yet.");

    /// DEFAULT_INDEX_MARK_CACHE_MAX_SIZE is used when the key is absent from the configuration.
    const size_t new_max_size = config.getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE);
    shared->index_mark_cache->setMaxSize(new_max_size);
}
MarkCachePtr Context::getIndexMarkCache() const
{
auto lock = getLock();
@ -2376,8 +2418,9 @@ MarkCachePtr Context::getIndexMarkCache() const
void Context::clearIndexMarkCache() const
{
auto lock = getLock();
if (shared->index_mark_cache)
shared->index_mark_cache->reset();
shared->index_mark_cache->clear();
}
void Context::setMMappedFileCache(size_t cache_size_in_num_entries)
@ -2390,6 +2433,17 @@ void Context::setMMappedFileCache(size_t cache_size_in_num_entries)
shared->mmap_cache = std::make_shared<MMappedFileCache>(cache_size_in_num_entries);
}
/// Reads `mmap_cache_size` from the given configuration and applies it as the new maximum size of
/// the already created mmapped file cache.
/// Throws LOGICAL_ERROR if the mmapped file cache does not exist.
void Context::updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
    auto lock = getLock();

    if (shared->mmap_cache == nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Mapped file cache was not created yet.");

    /// NOTE(review): setMMappedFileCache() takes `cache_size_in_num_entries`, so this limit is
    /// presumably a number of entries rather than a number of bytes — confirm.
    const size_t new_max_size = config.getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE);
    shared->mmap_cache->setMaxSize(new_max_size);
}
MMappedFileCachePtr Context::getMMappedFileCache() const
{
auto lock = getLock();
@ -2399,8 +2453,9 @@ MMappedFileCachePtr Context::getMMappedFileCache() const
void Context::clearMMappedFileCache() const
{
auto lock = getLock();
if (shared->mmap_cache)
shared->mmap_cache->reset();
shared->mmap_cache->clear();
}
void Context::setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_rows)
@ -2416,14 +2471,15 @@ void Context::setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t
/// Reads the `query_cache.*` limits from the given configuration and applies them to the already
/// created query cache.
/// Throws LOGICAL_ERROR if the query cache does not exist.
void Context::updateQueryCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
    auto lock = getLock();

    if (!shared->query_cache)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Query cache was not created yet.");

    size_t max_size_in_bytes = config.getUInt64("query_cache.max_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_SIZE);
    size_t max_entries = config.getUInt64("query_cache.max_entries", DEFAULT_QUERY_CACHE_MAX_ENTRIES);
    size_t max_entry_size_in_bytes = config.getUInt64("query_cache.max_entry_size_in_bytes", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_BYTES);

    /// The key was previously misspelled as "max_entry_rows_in_rows", so a correctly spelled
    /// setting was silently ignored. Read the correct key first and fall back to the legacy
    /// spelling, so that configurations relying on the old name keep working.
    size_t max_entry_size_in_rows = config.getUInt64(
        "query_cache.max_entry_size_in_rows",
        config.getUInt64("query_cache.max_entry_rows_in_rows", DEFAULT_QUERY_CACHE_MAX_ENTRY_SIZE_IN_ROWS));

    shared->query_cache->updateConfiguration(max_size_in_bytes, max_entries, max_entry_size_in_bytes, max_entry_size_in_rows);
}
QueryCachePtr Context::getQueryCache() const
@ -2435,30 +2491,36 @@ QueryCachePtr Context::getQueryCache() const
void Context::clearQueryCache() const
{
auto lock = getLock();
if (shared->query_cache)
shared->query_cache->reset();
shared->query_cache->clear();
}
void Context::clearCaches() const
{
auto lock = getLock();
if (shared->uncompressed_cache)
shared->uncompressed_cache->reset();
if (!shared->uncompressed_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Uncompressed cache was not created yet.");
shared->uncompressed_cache->clear();
if (shared->mark_cache)
shared->mark_cache->reset();
if (!shared->mark_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mark cache was not created yet.");
shared->mark_cache->clear();
if (shared->index_uncompressed_cache)
shared->index_uncompressed_cache->reset();
if (!shared->index_uncompressed_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index uncompressed cache was not created yet.");
shared->index_uncompressed_cache->clear();
if (shared->index_mark_cache)
shared->index_mark_cache->reset();
if (!shared->index_mark_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index mark cache was not created yet.");
shared->index_mark_cache->clear();
if (shared->mmap_cache)
shared->mmap_cache->reset();
if (!shared->mmap_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mmapped file cache was not created yet.");
shared->mmap_cache->clear();
/// Intentionally not dropping the query cache which is transactionally inconsistent by design.
/// Intentionally not clearing the query cache which is transactionally inconsistent by design.
}
ThreadPool & Context::getPrefetchThreadpool() const

View File

@ -922,33 +922,32 @@ public:
/// --- Caches ------------------------------------------------------------------------------------------
/// Create a cache of uncompressed blocks of specified size. This can be done only once.
void setUncompressedCache(const String & uncompressed_cache_policy, size_t max_size_in_bytes);
void updateUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<UncompressedCache> getUncompressedCache() const;
void clearUncompressedCache() const;
/// Create a cache of marks of specified size. This can be done only once.
void setMarkCache(const String & mark_cache_policy, size_t cache_size_in_bytes);
void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<MarkCache> getMarkCache() const;
void clearMarkCache() const;
ThreadPool & getLoadMarksThreadpool() const;
/// Create a cache of index uncompressed blocks of specified size. This can be done only once.
void setIndexUncompressedCache(size_t max_size_in_bytes);
void updateIndexUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<UncompressedCache> getIndexUncompressedCache() const;
void clearIndexUncompressedCache() const;
/// Create a cache of index marks of specified size. This can be done only once.
void setIndexMarkCache(size_t cache_size_in_bytes);
void updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<MarkCache> getIndexMarkCache() const;
void clearIndexMarkCache() const;
/// Create a cache of mapped files to avoid frequent open/map/unmap/close and to reuse from several threads.
void setMMappedFileCache(size_t cache_size_in_num_entries);
void updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<MMappedFileCache> getMMappedFileCache() const;
void clearMMappedFileCache() const;
/// Create a cache of query results for statements which run repeatedly.
void setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_rows);
void updateQueryCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<QueryCache> getQueryCache() const;

View File

@ -341,14 +341,10 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(
{
TableNameHints hints(this->tryGetDatabase(table_id.getDatabaseName()), getContext());
std::vector<String> names = hints.getHints(table_id.getTableName());
if (!names.empty())
{
/// There is two options: first is to print just the name of the table
/// and the second is to print the result in format: db_name.table_name. I'll comment out the second option below
/// I also leave possibility to print several suggestions
if (names.empty())
exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_id.getNameForLogs()));
else
exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist. Maybe you meant {}?", table_id.getNameForLogs(), backQuoteIfNeed(names[0])));
}
else exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_id.getNameForLogs()));
}
return {};
}

View File

@ -704,6 +704,9 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index is disabled. Turn on allow_experimental_annoy_index");
if (index_desc.type == "usearch" && !settings.allow_experimental_usearch_index)
throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index is disabled. Turn on allow_experimental_usearch_index");
properties.indices.push_back(index_desc);
}
if (create.columns_list->projections)

View File

@ -345,7 +345,7 @@ BlockIO InterpreterSystemQuery::execute()
case Type::DROP_COMPILED_EXPRESSION_CACHE:
getContext()->checkAccess(AccessType::SYSTEM_DROP_COMPILED_EXPRESSION_CACHE);
if (auto * cache = CompiledExpressionCacheFactory::instance().tryGetCache())
cache->reset();
cache->clear();
break;
#endif
#if USE_AWS_S3

View File

@ -6,6 +6,7 @@
#include <Interpreters/Cache/FileCache.h>
#include <Interpreters/Cache/FileCacheFactory.h>
#include <Interpreters/Context.h>
#include <Interpreters/Cache/QueryCache.h>
#include <Interpreters/JIT/CompiledExpressionCache.h>
#include <Databases/IDatabase.h>

View File

@ -64,8 +64,8 @@ inline bool operator==(SmallStringRef lhs, SmallStringRef rhs)
if (lhs.size == 0)
return true;
#ifdef __SSE2__
return memequalSSE2Wide(lhs.data(), rhs.data(), lhs.size);
#if defined(__SSE2__) || (defined(__aarch64__) && defined(__ARM_NEON))
return memequalWide(lhs.data(), rhs.data(), lhs.size);
#else
return 0 == memcmp(lhs.data(), rhs.data(), lhs.size);
#endif

View File

@ -14,6 +14,7 @@ class ASTIndexDeclaration : public IAST
public:
static const auto DEFAULT_INDEX_GRANULARITY = 1uz;
static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz;
static const auto DEFAULT_USEARCH_INDEX_GRANULARITY = 100'000'000uz;
String name;
IAST * expr;

View File

@ -204,7 +204,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &,
}
else if (type == Type::SUSPEND)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << " FOR "
settings.ostr << (settings.hilite ? hilite_keyword : "") << " FOR "
<< (settings.hilite ? hilite_none : "") << seconds
<< (settings.hilite ? hilite_keyword : "") << " SECOND"
<< (settings.hilite ? hilite_none : "");
@ -232,12 +232,50 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &,
}
else if (type == Type::START_LISTEN || type == Type::STOP_LISTEN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << " " << ServerType::serverTypeToString(server_type.type)
<< (settings.hilite ? hilite_none : "");
settings.ostr << (settings.hilite ? hilite_keyword : "") << " "
<< ServerType::serverTypeToString(server_type.type) << (settings.hilite ? hilite_none : "");
if (server_type.type == ServerType::CUSTOM)
if (server_type.type == ServerType::Type::CUSTOM)
{
settings.ostr << (settings.hilite ? hilite_identifier : "") << " " << backQuoteIfNeed(server_type.custom_name);
settings.ostr << " " << quoteString(server_type.custom_name);
}
bool comma = false;
if (!server_type.exclude_types.empty())
{
settings.ostr << (settings.hilite ? hilite_keyword : "")
<< " EXCEPT" << (settings.hilite ? hilite_none : "");
for (auto cur_type : server_type.exclude_types)
{
if (cur_type == ServerType::Type::CUSTOM)
continue;
if (comma)
settings.ostr << ",";
else
comma = true;
settings.ostr << (settings.hilite ? hilite_keyword : "") << " "
<< ServerType::serverTypeToString(cur_type) << (settings.hilite ? hilite_none : "");
}
if (server_type.exclude_types.contains(ServerType::Type::CUSTOM))
{
for (const auto & cur_name : server_type.exclude_custom_names)
{
if (comma)
settings.ostr << ",";
else
comma = true;
settings.ostr << (settings.hilite ? hilite_keyword : "") << " "
<< ServerType::serverTypeToString(ServerType::Type::CUSTOM) << (settings.hilite ? hilite_none : "");
settings.ostr << " " << quoteString(cur_name);
}
}
}
}

View File

@ -66,6 +66,8 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected
{
if (index->type && index->type->name == "annoy")
index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
else if (index->type && index->type->name == "usearch")
index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
else
index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
}

View File

@ -148,6 +148,8 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe
{
if (index->type->name == "annoy")
index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
else if (index->type->name == "usearch")
index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
else
index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
}

View File

@ -458,32 +458,71 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
if (!parseQueryWithOnCluster(res, pos, expected))
return false;
ServerType::Type current_type = ServerType::Type::END;
std::string current_custom_name;
for (const auto & type : magic_enum::enum_values<ServerType::Type>())
auto parse_server_type = [&](ServerType::Type & type, std::string & custom_name) -> bool
{
if (ParserKeyword{ServerType::serverTypeToString(type)}.ignore(pos, expected))
type = ServerType::Type::END;
custom_name = "";
for (const auto & cur_type : magic_enum::enum_values<ServerType::Type>())
{
current_type = type;
break;
if (ParserKeyword{ServerType::serverTypeToString(cur_type)}.ignore(pos, expected))
{
type = cur_type;
break;
}
}
if (type == ServerType::Type::END)
return false;
if (type == ServerType::CUSTOM)
{
ASTPtr ast;
if (!ParserStringLiteral{}.parse(pos, ast, expected))
return false;
custom_name = ast->as<ASTLiteral &>().value.get<const String &>();
}
return true;
};
ServerType::Type base_type;
std::string base_custom_name;
ServerType::Types exclude_type;
ServerType::CustomNames exclude_custom_names;
if (!parse_server_type(base_type, base_custom_name))
return false;
if (ParserKeyword{"EXCEPT"}.ignore(pos, expected))
{
if (base_type != ServerType::Type::QUERIES_ALL &&
base_type != ServerType::Type::QUERIES_DEFAULT &&
base_type != ServerType::Type::QUERIES_CUSTOM)
return false;
ServerType::Type current_type;
std::string current_custom_name;
while (true)
{
if (!exclude_type.empty() && !ParserToken(TokenType::Comma).ignore(pos, expected))
break;
if (!parse_server_type(current_type, current_custom_name))
return false;
exclude_type.insert(current_type);
if (current_type == ServerType::Type::CUSTOM)
exclude_custom_names.insert(current_custom_name);
}
}
if (current_type == ServerType::Type::END)
return false;
if (current_type == ServerType::CUSTOM)
{
ASTPtr ast;
if (!ParserStringLiteral{}.parse(pos, ast, expected))
return false;
current_custom_name = ast->as<ASTLiteral &>().value.get<const String &>();
}
res->server_type = ServerType(current_type, current_custom_name);
res->server_type = ServerType(base_type, base_custom_name, exclude_type, exclude_custom_names);
break;
}

View File

@ -75,10 +75,13 @@ void DelayedPortsProcessor::finishPair(PortsPair & pair)
pair.input_port->close();
pair.is_finished = true;
++num_finished_pairs;
++num_finished_inputs;
if (pair.output_port)
++num_finished_outputs;
if (!pair.is_delayed)
++num_finished_main_inputs;
}
}
@ -112,9 +115,15 @@ bool DelayedPortsProcessor::processPair(PortsPair & pair)
return true;
}
bool DelayedPortsProcessor::shouldSkipDelayed() const
{
return num_finished_main_inputs + num_delayed_ports < port_pairs.size();
}
IProcessor::Status DelayedPortsProcessor::prepare(const PortNumbers & updated_inputs, const PortNumbers & updated_outputs)
{
bool skip_delayed = (num_finished_pairs + num_delayed_ports) < port_pairs.size();
bool skip_delayed = shouldSkipDelayed();
bool need_data = false;
if (!are_inputs_initialized && !updated_outputs.empty())
@ -154,14 +163,14 @@ IProcessor::Status DelayedPortsProcessor::prepare(const PortNumbers & updated_in
}
/// In case if main streams are finished at current iteration, start processing delayed streams.
if (skip_delayed && (num_finished_pairs + num_delayed_ports) >= port_pairs.size())
if (skip_delayed && !shouldSkipDelayed())
{
for (auto & pair : port_pairs)
if (pair.is_delayed)
need_data = processPair(pair) || need_data;
}
if (num_finished_pairs == port_pairs.size())
if (num_finished_inputs == port_pairs.size())
return Status::Finished;
if (need_data)

View File

@ -29,14 +29,16 @@ private:
std::vector<PortsPair> port_pairs;
const size_t num_delayed_ports;
size_t num_finished_pairs = 0;
size_t num_finished_inputs = 0;
size_t num_finished_outputs = 0;
size_t num_finished_main_inputs = 0;
std::vector<size_t> output_to_pair;
bool are_inputs_initialized = false;
bool processPair(PortsPair & pair);
void finishPair(PortsPair & pair);
bool shouldSkipDelayed() const;
};
}

View File

@ -10,6 +10,8 @@
namespace DB
{
struct SelectQueryInfo;
using ColumnMappingPtr = std::shared_ptr<ColumnMapping>;
/** Input format is a source, that reads data from ReadBuffer.
@ -21,9 +23,13 @@ protected:
ReadBuffer * in [[maybe_unused]] = nullptr;
public:
// ReadBuffer can be nullptr for random-access formats.
/// ReadBuffer can be nullptr for random-access formats.
IInputFormat(Block header, ReadBuffer * in_);
/// If the format is used by a SELECT query, this method may be called.
/// The format may use it for filter pushdown.
virtual void setQueryInfo(const SelectQueryInfo &, ContextPtr) {}
/** In some usecase (hello Kafka) we need to read a lot of tiny streams in exactly the same format.
* The recreating of parser for each small stream takes too long, so we introduce a method
* resetParser() which allow to reset the state of parser to continue reading of

View File

@ -115,21 +115,24 @@ NamesAndTypesList IRowSchemaReader::readSchema()
"Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. "
"Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0");
DataTypes data_types = readRowAndGetDataTypes();
auto data_types_maybe = readRowAndGetDataTypes();
/// Check that we read at least one column.
if (data_types.empty())
if (!data_types_maybe)
throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data");
DataTypes data_types = std::move(*data_types_maybe);
/// If column names weren't set, use default names 'c1', 'c2', ...
if (column_names.empty())
bool use_default_column_names = column_names.empty();
if (use_default_column_names)
{
column_names.reserve(data_types.size());
for (size_t i = 0; i != data_types.size(); ++i)
column_names.push_back("c" + std::to_string(i + 1));
}
/// If column names were set, check that the number of names match the number of types.
else if (column_names.size() != data_types.size())
else if (column_names.size() != data_types.size() && !allowVariableNumberOfColumns())
{
throw Exception(
ErrorCodes::INCORRECT_DATA,
@ -137,6 +140,9 @@ NamesAndTypesList IRowSchemaReader::readSchema()
}
else
{
if (column_names.size() != data_types.size())
data_types.resize(column_names.size());
std::unordered_set<std::string_view> names_set;
for (const auto & name : column_names)
{
@ -155,13 +161,39 @@ NamesAndTypesList IRowSchemaReader::readSchema()
for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read)
{
DataTypes new_data_types = readRowAndGetDataTypes();
if (new_data_types.empty())
auto new_data_types_maybe = readRowAndGetDataTypes();
if (!new_data_types_maybe)
/// We reached eof.
break;
DataTypes new_data_types = std::move(*new_data_types_maybe);
if (new_data_types.size() != data_types.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values");
{
if (!allowVariableNumberOfColumns())
throw Exception(ErrorCodes::INCORRECT_DATA, "Rows have different amount of values");
if (use_default_column_names)
{
/// Current row contains new columns, add new default names.
if (new_data_types.size() > data_types.size())
{
for (size_t i = data_types.size(); i < new_data_types.size(); ++i)
column_names.push_back("c" + std::to_string(i + 1));
data_types.resize(new_data_types.size());
}
/// Current row contains fewer columns than previous rows.
else
{
new_data_types.resize(data_types.size());
}
}
/// If names were explicitly set, ignore all extra columns.
else
{
new_data_types.resize(column_names.size());
}
}
for (field_index = 0; field_index != data_types.size(); ++field_index)
{

View File

@ -93,11 +93,13 @@ protected:
/// Read one row and determine types of columns in it.
/// Return types in the same order in which the values were in the row.
/// If it's impossible to determine the type for some column, return nullptr for it.
/// Return empty list if can't read more data.
virtual DataTypes readRowAndGetDataTypes() = 0;
/// Return std::nullopt if can't read more data.
virtual std::optional<DataTypes> readRowAndGetDataTypes() = 0;
void setColumnNames(const std::vector<String> & names) { column_names = names; }
virtual bool allowVariableNumberOfColumns() const { return false; }
size_t field_index;
private:

View File

@ -284,7 +284,7 @@ bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out)
return true;
}
bool CSVFormatReader::allowVariableNumberOfColumns()
bool CSVFormatReader::allowVariableNumberOfColumns() const
{
return format_settings.csv.allow_variable_number_of_columns;
}
@ -410,19 +410,22 @@ CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_t
{
}
std::pair<std::vector<String>, DataTypes> CSVSchemaReader::readRowAndGetFieldsAndDataTypes()
std::optional<std::pair<std::vector<String>, DataTypes>> CSVSchemaReader::readRowAndGetFieldsAndDataTypes()
{
if (buf.eof())
return {};
auto fields = reader.readRow();
auto data_types = tryInferDataTypesByEscapingRule(fields, format_settings, FormatSettings::EscapingRule::CSV);
return {fields, data_types};
return std::make_pair(std::move(fields), std::move(data_types));
}
DataTypes CSVSchemaReader::readRowAndGetDataTypesImpl()
std::optional<DataTypes> CSVSchemaReader::readRowAndGetDataTypesImpl()
{
return std::move(readRowAndGetFieldsAndDataTypes().second);
auto fields_with_types = readRowAndGetFieldsAndDataTypes();
if (!fields_with_types)
return {};
return std::move(fields_with_types->second);
}

View File

@ -70,7 +70,7 @@ public:
void skipPrefixBeforeHeader() override;
bool checkForEndOfRow() override;
bool allowVariableNumberOfColumns() override;
bool allowVariableNumberOfColumns() const override;
std::vector<String> readNames() override { return readHeaderRow(); }
std::vector<String> readTypes() override { return readHeaderRow(); }
@ -102,8 +102,10 @@ public:
CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
private:
DataTypes readRowAndGetDataTypesImpl() override;
std::pair<std::vector<String>, DataTypes> readRowAndGetFieldsAndDataTypes() override;
bool allowVariableNumberOfColumns() const override { return format_settings.csv.allow_variable_number_of_columns; }
std::optional<DataTypes> readRowAndGetDataTypesImpl() override;
std::optional<std::pair<std::vector<String>, DataTypes>> readRowAndGetFieldsAndDataTypes() override;
PeekableReadBuffer buf;
CSVFormatReader reader;

Some files were not shown because too many files have changed in this diff Show More