diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md similarity index 82% rename from .github/ISSUE_TEMPLATE/bug_report.md rename to .github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md index c3283d3d97d..9526b99b22b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md @@ -1,16 +1,16 @@ --- -name: Bug report +name: Bug report or unexpected behaviour about: Create a report to help us improve ClickHouse title: '' -labels: bug, issue +labels: bug assignees: '' --- (you don't have to strictly follow this form) -**Describe the bug** -A clear and concise description of what the bug is. +**Describe the bug or unexpected behaviour** +A clear and concise description of what works not as it is supposed to. **How to reproduce** * Which ClickHouse server version to use diff --git a/.github/ISSUE_TEMPLATE/documentation-issue.md b/.github/ISSUE_TEMPLATE/documentation-issue.md new file mode 100644 index 00000000000..a8f31eadc56 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation-issue.md @@ -0,0 +1,16 @@ +--- +name: Documentation issue +about: Report something incorrect or missing in documentation +title: '' +labels: documentation +assignees: BayoNet + +--- + +(you don't have to strictly follow this form) + +**Describe the issue** +A clear and concise description of what's wrong in documentation. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/performance-issue.md b/.github/ISSUE_TEMPLATE/performance-issue.md new file mode 100644 index 00000000000..402617d00f7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/performance-issue.md @@ -0,0 +1,27 @@ +--- +name: Performance issue +about: Report something working slower than expected +title: '' +labels: performance +assignees: '' + +--- + +(you don't have to strictly follow this form) + +**Describe the situation** +What exactly works slower than expected? + +**How to reproduce** +* Which ClickHouse server version to use +* Which interface to use, if matters +* Non-default settings, if any +* `CREATE TABLE` statements for all tables involved +* Sample data for all these tables, use [clickhouse-obfuscator](https://github.com/yandex/ClickHouse/blob/master/dbms/programs/obfuscator/Obfuscator.cpp#L42-L80) if necessary +* Queries to run that lead to slow performance + +**Expected performance** +What are your performance expectation, why do you think they are realistic? Has it been working faster in older ClickHouse releases? Is it working faster in some specific other system? + +**Additional context** +Add any other context about the problem here. diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 63a454d0ea6..992b672bccc 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -372,8 +372,8 @@ if (USE_PROTOBUF) endif () if (USE_HDFS) - target_link_libraries (clickhouse_common_io PRIVATE ${HDFS3_LIBRARY}) - target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${HDFS3_INCLUDE_DIR}) + target_link_libraries (clickhouse_common_io PUBLIC ${HDFS3_LIBRARY}) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${HDFS3_INCLUDE_DIR}) endif() if (USE_BROTLI) diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp index df5e8568d21..854e226cb94 100644 --- a/dbms/programs/client/Client.cpp +++ b/dbms/programs/client/Client.cpp @@ -431,8 +431,14 @@ private: /// Load command history if present. 
if (config().has("history_file")) history_file = config().getString("history_file"); - else if (!home_path.empty()) - history_file = home_path + "/.clickhouse-client-history"; + else + { + auto history_file_from_env = getenv("CLICKHOUSE_HISTORY_FILE"); + if (history_file_from_env) + history_file = history_file_from_env; + else if (!home_path.empty()) + history_file = home_path + "/.clickhouse-client-history"; + } if (!history_file.empty()) { diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index fd058f1fbe0..9001c1e6801 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -449,6 +449,8 @@ namespace ErrorCodes extern const int READONLY_SETTING = 472; extern const int DEADLOCK_AVOIDED = 473; extern const int INVALID_TEMPLATE_FORMAT = 474; + extern const int INVALID_WITH_FILL_EXPRESSION = 475; + extern const int WITH_TIES_WITHOUT_ORDER_BY = 476; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Common/SharedBlockRowRef.h b/dbms/src/Common/SharedBlockRowRef.h new file mode 100644 index 00000000000..193f7e4dd05 --- /dev/null +++ b/dbms/src/Common/SharedBlockRowRef.h @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +/// Allows you refer to the row in the block and hold the block ownership, +/// and thus avoid creating a temporary row object. +/// Do not use std::shared_ptr, since there is no need for a place for `weak_count` and `deleter`; +/// does not use Poco::SharedPtr, since you need to allocate a block and `refcount` in one piece; +/// does not use Poco::AutoPtr, since it does not have a `move` constructor and there are extra checks for nullptr; +/// The reference counter is not atomic, since it is used from one thread. +namespace detail +{ + struct SharedBlock : Block + { + int refcount = 0; + + ColumnRawPtrs all_columns; + ColumnRawPtrs sort_columns; + + SharedBlock(Block && block) : Block(std::move(block)) {} + }; +} + +inline void intrusive_ptr_add_ref(detail::SharedBlock * ptr) +{ + ++ptr->refcount; +} + +inline void intrusive_ptr_release(detail::SharedBlock * ptr) +{ + if (0 == --ptr->refcount) + delete ptr; +} + +using SharedBlockPtr = boost::intrusive_ptr; + +struct SharedBlockRowRef +{ + ColumnRawPtrs * columns = nullptr; + size_t row_num; + SharedBlockPtr shared_block; + + void swap(SharedBlockRowRef & other) + { + std::swap(columns, other.columns); + std::swap(row_num, other.row_num); + std::swap(shared_block, other.shared_block); + } + + /// The number and types of columns must match. + bool operator==(const SharedBlockRowRef & other) const + { + size_t size = columns->size(); + for (size_t i = 0; i < size; ++i) + if (0 != (*columns)[i]->compareAt(row_num, other.row_num, *(*other.columns)[i], 1)) + return false; + return true; + } + + bool operator!=(const SharedBlockRowRef & other) const + { + return !(*this == other); + } + + void reset() + { + SharedBlockRowRef empty; + swap(empty); + } + + bool empty() const { return columns == nullptr; } + size_t size() const { return empty() ? 
0 : columns->size(); } + + void set(SharedBlockPtr & shared_block_, ColumnRawPtrs * columns_, size_t row_num_) + { + shared_block = shared_block_; + columns = columns_; + row_num = row_num_; + } +}; + +} diff --git a/dbms/src/Common/parseGlobs.cpp b/dbms/src/Common/parseGlobs.cpp new file mode 100644 index 00000000000..b7595d4591c --- /dev/null +++ b/dbms/src/Common/parseGlobs.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include + +namespace DB +{ +/* Transforms string from grep-wildcard-syntax ("{N..M}", "{a,b,c}" as in remote table function and "*", "?") to perl-regexp for using re2 library fo matching + * with such steps: + * 1) search intervals like {0..9} and enums like {abc,xyz,qwe} in {}, replace them by regexp with pipe (expr1|expr2|expr3), + * 2) search and replace "*" and "?". + * Before each search need to escape symbols that we would not search. + * + * There are few examples in unit tests. + */ +std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_globs) +{ + std::ostringstream oss_for_escaping; + /// Escaping only characters that not used in glob syntax + for (const auto & letter : initial_str_with_globs) + { + if ((letter == '[') || (letter == ']') || (letter == '|') || (letter == '+') || (letter == '-') || (letter == '(') || (letter == ')')) + oss_for_escaping << '\\'; + oss_for_escaping << letter; + } + std::string escaped_with_globs = oss_for_escaping.str(); + + static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without {}*, + re2::StringPiece input(escaped_with_globs); + re2::StringPiece matched; + std::ostringstream oss_for_replacing; + size_t current_index = 0; + while (RE2::FindAndConsume(&input, enum_or_range, &matched)) + { + std::string buffer = matched.ToString(); + oss_for_replacing << escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1) << '('; + + if (buffer.find(',') == std::string::npos) + { + size_t range_begin, range_end; + char point; + std::istringstream iss_range(buffer); + iss_range >> range_begin >> point >> point >> range_end; + oss_for_replacing << range_begin; + for (size_t i = range_begin + 1; i <= range_end; ++i) + { + oss_for_replacing << '|' << i; + } + } + else + { + std::replace(buffer.begin(), buffer.end(), ',', '|'); + oss_for_replacing << buffer; + } + oss_for_replacing << ")"; + current_index = input.data() - escaped_with_globs.data(); + } + oss_for_replacing << escaped_with_globs.substr(current_index); + std::string almost_res = oss_for_replacing.str(); + std::ostringstream oss_final_processing; + for (const auto & letter : almost_res) + { + if ((letter == '?') || (letter == '*')) + { + oss_final_processing << "[^/]"; /// '?' is any symbol except '/' + if (letter == '?') + continue; + } + if ((letter == '.') || (letter == '{') || (letter == '}')) + oss_final_processing << '\\'; + oss_final_processing << letter; + } + return oss_final_processing.str(); +} +} diff --git a/dbms/src/Common/parseGlobs.h b/dbms/src/Common/parseGlobs.h new file mode 100644 index 00000000000..24fff0c3d0a --- /dev/null +++ b/dbms/src/Common/parseGlobs.h @@ -0,0 +1,10 @@ +#pragma once +#include +#include + +namespace DB +{ +/* Parse globs in string and make a regexp for it. 
+ */ +std::string makeRegexpPatternFromGlobs(const std::string & path); +} diff --git a/dbms/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp b/dbms/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp new file mode 100644 index 00000000000..db695b965a1 --- /dev/null +++ b/dbms/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp @@ -0,0 +1,20 @@ +#include +#include +#include + + +using namespace DB; + + +TEST(Common, makeRegexpPatternFromGlobs) +{ + EXPECT_EQ(makeRegexpPatternFromGlobs("f{01..09}"), "f(1|2|3|4|5|6|7|8|9)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{01..9}"), "f(1|2|3|4|5|6|7|8|9)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{0001..0000009}"), "f(1|2|3|4|5|6|7|8|9)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..2}{1..2}"), "f(1|2)(1|2)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..1}{1..1}"), "f(1)(1)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{0..0}{0..0}"), "f(0)(0)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("file{1..5}"),"file(1|2|3|4|5)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("file{1,2,3}"),"file(1|2|3)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("{1,2,3}blabla{a.x,b.x,c.x}smth[]_else{aa,bb}?*"), "(1|2|3)blabla(a\\.x|b\\.x|c\\.x)smth\\[\\]_else(aa|bb)[^/][^/]*"); +} diff --git a/dbms/src/Core/Names.h b/dbms/src/Core/Names.h index 61220779f7b..5489a233b6e 100644 --- a/dbms/src/Core/Names.h +++ b/dbms/src/Core/Names.h @@ -16,4 +16,7 @@ using NameOrderedSet = std::set; using NameToNameMap = std::unordered_map; using NameToNameSetMap = std::unordered_map; +using NameWithAlias = std::pair; +using NamesWithAliases = std::vector; + } diff --git a/dbms/src/Core/SortDescription.h b/dbms/src/Core/SortDescription.h index ebf3a401d9b..e1ec142f645 100644 --- a/dbms/src/Core/SortDescription.h +++ b/dbms/src/Core/SortDescription.h @@ -4,13 +4,22 @@ #include #include #include - +#include class Collator; namespace DB { +struct FillColumnDescription +{ + /// All missed values in range [FROM, TO) will be filled + /// Range [FROM, TO) respects sorting direction + Field fill_from; /// Fill value >= FILL_FROM + Field fill_to; /// Fill value + STEP < FILL_TO + Field fill_step; /// Default = 1 or -1 according to direction +}; + /// Description of the sorting rule by one column. struct SortColumnDescription { @@ -20,12 +29,23 @@ struct SortColumnDescription int nulls_direction; /// 1 - NULLs and NaNs are greater, -1 - less. /// To achieve NULLS LAST, set it equal to direction, to achieve NULLS FIRST, set it opposite. 
std::shared_ptr collator; /// Collator for locale-specific comparison of strings + bool with_fill; + FillColumnDescription fill_description; - SortColumnDescription(size_t column_number_, int direction_, int nulls_direction_, const std::shared_ptr & collator_ = nullptr) - : column_number(column_number_), direction(direction_), nulls_direction(nulls_direction_), collator(collator_) {} - SortColumnDescription(const std::string & column_name_, int direction_, int nulls_direction_, const std::shared_ptr & collator_ = nullptr) - : column_name(column_name_), column_number(0), direction(direction_), nulls_direction(nulls_direction_), collator(collator_) {} + SortColumnDescription( + size_t column_number_, int direction_, int nulls_direction_, + const std::shared_ptr & collator_ = nullptr, bool with_fill_ = false, + const FillColumnDescription & fill_description_ = {}) + : column_number(column_number_), direction(direction_), nulls_direction(nulls_direction_), collator(collator_) + , with_fill(with_fill_), fill_description(fill_description_) {} + + SortColumnDescription( + const std::string & column_name_, int direction_, int nulls_direction_, + const std::shared_ptr & collator_ = nullptr, bool with_fill_ = false, + const FillColumnDescription & fill_description_ = {}) + : column_name(column_name_), column_number(0), direction(direction_), nulls_direction(nulls_direction_) + , collator(collator_), with_fill(with_fill_), fill_description(fill_description_) {} bool operator == (const SortColumnDescription & other) const { diff --git a/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h b/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h index caa94ebbf15..6f9a0707e37 100644 --- a/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/AggregatingSortedBlockInputStream.h @@ -50,8 +50,8 @@ private: std::vector columns_to_aggregate; std::vector columns_to_simple_aggregate; - RowRef current_key; /// The current primary key. - RowRef next_key; /// The primary key of the next row. + SharedBlockRowRef current_key; /// The current primary key. + SharedBlockRowRef next_key; /// The primary key of the next row. /** We support two different cursors - with Collation and without. * Templates are used instead of polymorphic SortCursor and calls to virtual functions. diff --git a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h index 4b3b936d703..7e114e614f6 100644 --- a/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/CollapsingSortedBlockInputStream.h @@ -47,12 +47,12 @@ private: /// Read is finished. bool finished = false; - RowRef current_key; /// The current primary key. - RowRef next_key; /// The primary key of the next row. + SharedBlockRowRef current_key; /// The current primary key. + SharedBlockRowRef next_key; /// The primary key of the next row. - RowRef first_negative; /// The first negative row for the current primary key. - RowRef last_positive; /// The last positive row for the current primary key. - RowRef last_negative; /// Last negative row. It is only stored if there is not one row is written to output. + SharedBlockRowRef first_negative; /// The first negative row for the current primary key. + SharedBlockRowRef last_positive; /// The last positive row for the current primary key. + SharedBlockRowRef last_negative; /// Last negative row. It is only stored if there is not one row is written to output. 
size_t count_positive = 0; /// The number of positive rows for the current primary key. size_t count_negative = 0; /// The number of negative rows for the current primary key. diff --git a/dbms/src/DataStreams/FillingBlockInputStream.cpp b/dbms/src/DataStreams/FillingBlockInputStream.cpp new file mode 100644 index 00000000000..ec026d56ad0 --- /dev/null +++ b/dbms/src/DataStreams/FillingBlockInputStream.cpp @@ -0,0 +1,186 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_WITH_FILL_EXPRESSION; +} + +FillingBlockInputStream::FillingBlockInputStream( + const BlockInputStreamPtr & input, const SortDescription & sort_description_) + : sort_description(sort_description_), filling_row(sort_description_), next_row(sort_description_) +{ + children.push_back(input); + header = children.at(0)->getHeader(); + + std::vector is_fill_column(header.columns()); + for (const auto & elem : sort_description) + is_fill_column[header.getPositionByName(elem.column_name)] = true; + + auto try_convert_fields = [](FillColumnDescription & descr, const DataTypePtr & type) + { + auto max_type = Field::Types::Null; + WhichDataType which(type); + DataTypePtr to_type; + if (isInteger(type) || which.isDateOrDateTime()) + { + max_type = Field::Types::Int64; + to_type = std::make_shared(); + } + else if (which.isFloat()) + { + max_type = Field::Types::Float64; + to_type = std::make_shared(); + } + + if (descr.fill_from.getType() > max_type || descr.fill_to.getType() > max_type + || descr.fill_step.getType() > max_type) + return false; + descr.fill_from = convertFieldToType(descr.fill_from, *to_type); + descr.fill_to = convertFieldToType(descr.fill_to, *to_type); + descr.fill_step = convertFieldToType(descr.fill_step, *to_type); + + return true; + }; + + for (size_t i = 0; i < header.columns(); ++i) + { + if (is_fill_column[i]) + { + size_t pos = fill_column_positions.size(); + auto & descr = filling_row.getFillDescription(pos); + auto type = header.getByPosition(i).type; + if (!try_convert_fields(descr, type)) + throw Exception("Incompatible types of WITH FILL expression values with column type " + + type->getName(), ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + + if (type->isValueRepresentedByUnsignedInteger() && + ((!descr.fill_from.isNull() && less(descr.fill_from, Field{0}, 1)) || + (!descr.fill_to.isNull() && less(descr.fill_to, Field{0}, 1)))) + { + throw Exception("WITH FILL bound values cannot be negative for unsigned type " + + type->getName(), ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + } + + fill_column_positions.push_back(i); + } + else + other_column_positions.push_back(i); + } +} + + +Block FillingBlockInputStream::readImpl() +{ + Columns old_fill_columns; + Columns old_other_columns; + MutableColumns res_fill_columns; + MutableColumns res_other_columns; + + auto init_columns_by_positions = [](const Block & block, Columns & columns, + MutableColumns & mutable_columns, const Positions & positions) + { + for (size_t pos : positions) + { + auto column = block.getByPosition(pos).column; + columns.push_back(column); + mutable_columns.push_back(column->cloneEmpty()->assumeMutable()); + } + }; + + auto block = children.back()->read(); + if (!block) + { + init_columns_by_positions(header, old_fill_columns, res_fill_columns, fill_column_positions); + init_columns_by_positions(header, old_other_columns, res_other_columns, other_column_positions); + + bool should_insert_first = next_row < filling_row; + + bool generated = false; + for (size_t i = 0; i < 
filling_row.size(); ++i) + next_row[i] = filling_row.getFillDescription(i).fill_to; + + if (should_insert_first && filling_row < next_row) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + while (filling_row.next(next_row)) + { + generated = true; + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + } + + if (generated) + return createResultBlock(res_fill_columns, res_other_columns); + + return block; + } + + size_t rows = block.rows(); + init_columns_by_positions(block, old_fill_columns, res_fill_columns, fill_column_positions); + init_columns_by_positions(block, old_other_columns, res_other_columns, other_column_positions); + + if (first) + { + for (size_t i = 0; i < filling_row.size(); ++i) + { + auto current_value = (*old_fill_columns[i])[0]; + const auto & fill_from = filling_row.getFillDescription(i).fill_from; + if (!fill_from.isNull() && !equals(current_value, fill_from)) + { + filling_row.initFromDefaults(i); + if (less(fill_from, current_value, filling_row.getDirection(i))) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + break; + } + filling_row[i] = current_value; + } + first = false; + } + + for (size_t row_ind = 0; row_ind < rows; ++row_ind) + { + bool should_insert_first = next_row < filling_row; + + for (size_t i = 0; i < filling_row.size(); ++i) + { + auto current_value = (*old_fill_columns[i])[row_ind]; + const auto & fill_to = filling_row.getFillDescription(i).fill_to; + + if (fill_to.isNull() || less(current_value, fill_to, filling_row.getDirection(i))) + next_row[i] = current_value; + else + next_row[i] = fill_to; + } + + /// A case, when at previous step row was initialized from defaults 'fill_from' values + /// and probably we need to insert it to block. + if (should_insert_first && filling_row < next_row) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + /// Insert generated filling row to block, while it is less than current row in block. + while (filling_row.next(next_row)) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + copyRowFromColumns(res_fill_columns, old_fill_columns, row_ind); + copyRowFromColumns(res_other_columns, old_other_columns, row_ind); + } + + return createResultBlock(res_fill_columns, res_other_columns); +} + +Block FillingBlockInputStream::createResultBlock(MutableColumns & fill_columns, MutableColumns & other_columns) const +{ + MutableColumns result_columns(header.columns()); + for (size_t i = 0; i < fill_columns.size(); ++i) + result_columns[fill_column_positions[i]] = std::move(fill_columns[i]); + for (size_t i = 0; i < other_columns.size(); ++i) + result_columns[other_column_positions[i]] = std::move(other_columns[i]); + + return header.cloneWithColumns(std::move(result_columns)); +} + +} diff --git a/dbms/src/DataStreams/FillingBlockInputStream.h b/dbms/src/DataStreams/FillingBlockInputStream.h new file mode 100644 index 00000000000..3cc4702e374 --- /dev/null +++ b/dbms/src/DataStreams/FillingBlockInputStream.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/** Implements modifier WITH FILL of ORDER BY clause. + * It fills gaps in data stream by rows with missing values in columns with set WITH FILL and deafults in other columns. + * Optionally FROM, TO and STEP values can be specified. 
+ */ +class FillingBlockInputStream : public IBlockInputStream +{ +public: + FillingBlockInputStream(const BlockInputStreamPtr & input, const SortDescription & fill_description_); + + String getName() const override { return "Filling"; } + + Block getHeader() const override { return header; } + +protected: + Block readImpl() override; + +private: + Block createResultBlock(MutableColumns & fill_columns, MutableColumns & other_columns) const; + + const SortDescription sort_description; /// Contains only rows with WITH FILL. + FillingRow filling_row; /// Current row, which is used to fill gaps. + FillingRow next_row; /// Row to which we need to generate filling rows. + Block header; + + using Positions = std::vector; + Positions fill_column_positions; + Positions other_column_positions; + bool first = true; +}; + +} diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp index 456c43ca802..326141b0140 100644 --- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp +++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.cpp @@ -321,7 +321,7 @@ void GraphiteRollupSortedBlockInputStream::finishCurrentGroup(MutableColumns & m } -void GraphiteRollupSortedBlockInputStream::accumulateRow(RowRef & row) +void GraphiteRollupSortedBlockInputStream::accumulateRow(SharedBlockRowRef & row) { const Graphite::AggregationPattern * aggregation_pattern = std::get<1>(current_rule); if (aggregate_state_created) diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h index 560274f1dae..cfe4783af10 100644 --- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h +++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h @@ -204,7 +204,7 @@ private: StringRef current_group_path; /// Last row with maximum version for current primary key (time bucket). - RowRef current_subgroup_newest_row; + SharedBlockRowRef current_subgroup_newest_row; /// Time of last read row time_t current_time = 0; @@ -236,7 +236,7 @@ private: void finishCurrentGroup(MutableColumns & merged_columns); /// Update the state of the aggregate function with the new `value`. - void accumulateRow(RowRef & row); + void accumulateRow(SharedBlockRowRef & row); }; } diff --git a/dbms/src/DataStreams/LimitBlockInputStream.cpp b/dbms/src/DataStreams/LimitBlockInputStream.cpp index 1348a223000..5e262e921e8 100644 --- a/dbms/src/DataStreams/LimitBlockInputStream.cpp +++ b/dbms/src/DataStreams/LimitBlockInputStream.cpp @@ -6,8 +6,30 @@ namespace DB { -LimitBlockInputStream::LimitBlockInputStream(const BlockInputStreamPtr & input, UInt64 limit_, UInt64 offset_, bool always_read_till_end_, bool use_limit_as_total_rows_approx) - : limit(limit_), offset(offset_), always_read_till_end(always_read_till_end_) +/// gets pointers to all columns of block, which were used for ORDER BY +static ColumnRawPtrs extractSortColumns(const Block & block, const SortDescription & description) +{ + size_t size = description.size(); + ColumnRawPtrs res; + res.reserve(size); + + for (size_t i = 0; i < size; ++i) + { + const IColumn * column = !description[i].column_name.empty() + ? 
block.getByName(description[i].column_name).column.get() + : block.safeGetByPosition(description[i].column_number).column.get(); + res.emplace_back(column); + } + + return res; +} + + +LimitBlockInputStream::LimitBlockInputStream( + const BlockInputStreamPtr & input, UInt64 limit_, UInt64 offset_, bool always_read_till_end_, + bool use_limit_as_total_rows_approx, bool with_ties_, const SortDescription & description_) + : limit(limit_), offset(offset_), always_read_till_end(always_read_till_end_), with_ties(with_ties_) + , description(description_) { if (use_limit_as_total_rows_approx) { @@ -17,13 +39,45 @@ LimitBlockInputStream::LimitBlockInputStream(const BlockInputStreamPtr & input, children.push_back(input); } - Block LimitBlockInputStream::readImpl() { Block res; UInt64 rows = 0; - /// pos - how many rows were read, including the last read block + /// pos >= offset + limit and all rows in the end of previous block were equal + /// to row at 'limit' position. So we check current block. + if (!ties_row_ref.empty() && pos >= offset + limit) + { + res = children.back()->read(); + rows = res.rows(); + + if (!res) + return res; + + SharedBlockPtr ptr = new detail::SharedBlock(std::move(res)); + ptr->sort_columns = extractSortColumns(*ptr, description); + + UInt64 len; + for (len = 0; len < rows; ++len) + { + SharedBlockRowRef current_row; + current_row.set(ptr, &ptr->sort_columns, len); + + if (current_row != ties_row_ref) + { + ties_row_ref.reset(); + break; + } + } + + if (len < rows) + { + for (size_t i = 0; i < ptr->columns(); ++i) + ptr->safeGetByPosition(i).column = ptr->safeGetByPosition(i).column->cut(0, len); + } + + return *ptr; + } if (pos >= offset + limit) { @@ -46,9 +100,18 @@ Block LimitBlockInputStream::readImpl() pos += rows; } while (pos <= offset); - /// return the whole block + SharedBlockPtr ptr = new detail::SharedBlock(std::move(res)); + if (with_ties) + ptr->sort_columns = extractSortColumns(*ptr, description); + + /// give away the whole block if (pos >= offset + rows && pos <= offset + limit) - return res; + { + /// Save rowref for last row, because probalbly next block begins with the same row. + if (with_ties && pos == offset + limit) + ties_row_ref.set(ptr, &ptr->sort_columns, rows - 1); + return *ptr; + } /// give away a piece of the block UInt64 start = std::max( @@ -60,13 +123,36 @@ Block LimitBlockInputStream::readImpl() static_cast(pos) - static_cast(offset), static_cast(limit) + static_cast(offset) - static_cast(pos) + static_cast(rows))); - for (size_t i = 0; i < res.columns(); ++i) - res.getByPosition(i).column = res.getByPosition(i).column->cut(start, length); + + /// check if other rows in current block equals to last one in limit + if (with_ties) + { + ties_row_ref.set(ptr, &ptr->sort_columns, start + length - 1); + + for (size_t i = ties_row_ref.row_num + 1; i < rows; ++i) + { + SharedBlockRowRef current_row; + current_row.set(ptr, &ptr->sort_columns, i); + if (current_row == ties_row_ref) + ++length; + else + { + ties_row_ref.reset(); + break; + } + } + } + + if (length == rows) + return *ptr; + + for (size_t i = 0; i < ptr->columns(); ++i) + ptr->safeGetByPosition(i).column = ptr->safeGetByPosition(i).column->cut(start, length); // TODO: we should provide feedback to child-block, so it will know how many rows are actually consumed. // It's crucial for streaming engines like Kafka. 
- return res; + return *ptr; } } diff --git a/dbms/src/DataStreams/LimitBlockInputStream.h b/dbms/src/DataStreams/LimitBlockInputStream.h index ed6dac8c5ac..6c5f76cdaaf 100644 --- a/dbms/src/DataStreams/LimitBlockInputStream.h +++ b/dbms/src/DataStreams/LimitBlockInputStream.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -17,8 +18,13 @@ public: * If always_read_till_end = true - reads all the data to the end, but ignores them. This is necessary in rare cases: * when otherwise, due to the cancellation of the request, we would not have received the data for GROUP BY WITH TOTALS from the remote server. * If use_limit_as_total_rows_approx = true, then addTotalRowsApprox is called to use the limit in progress & stats + * with_ties = true, when query has WITH TIES modifier. If so, description should be provided + * description lets us know which row we should check for equality */ - LimitBlockInputStream(const BlockInputStreamPtr & input, UInt64 limit_, UInt64 offset_, bool always_read_till_end_ = false, bool use_limit_as_total_rows_approx = false); + LimitBlockInputStream( + const BlockInputStreamPtr & input, UInt64 limit_, UInt64 offset_, + bool always_read_till_end_ = false, bool use_limit_as_total_rows_approx = false, + bool with_ties_ = false, const SortDescription & description_ = {}); String getName() const override { return "Limit"; } @@ -32,6 +38,9 @@ private: UInt64 offset; UInt64 pos = 0; bool always_read_till_end; + bool with_ties; + const SortDescription description; + SharedBlockRowRef ties_row_ref; }; } diff --git a/dbms/src/DataStreams/MergingSortedBlockInputStream.h b/dbms/src/DataStreams/MergingSortedBlockInputStream.h index c519914f174..beb3c7afc52 100644 --- a/dbms/src/DataStreams/MergingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/MergingSortedBlockInputStream.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -24,39 +25,6 @@ namespace ErrorCodes } -/// Allows you refer to the row in the block and hold the block ownership, -/// and thus avoid creating a temporary row object. -/// Do not use std::shared_ptr, since there is no need for a place for `weak_count` and `deleter`; -/// does not use Poco::SharedPtr, since you need to allocate a block and `refcount` in one piece; -/// does not use Poco::AutoPtr, since it does not have a `move` constructor and there are extra checks for nullptr; -/// The reference counter is not atomic, since it is used from one thread. -namespace detail -{ -struct SharedBlock : Block -{ - int refcount = 0; - - ColumnRawPtrs all_columns; - ColumnRawPtrs sort_columns; - - SharedBlock(Block && block) : Block(std::move(block)) {} -}; -} - -using SharedBlockPtr = boost::intrusive_ptr; - -inline void intrusive_ptr_add_ref(detail::SharedBlock * ptr) -{ - ++ptr->refcount; -} - -inline void intrusive_ptr_release(detail::SharedBlock * ptr) -{ - if (0 == --ptr->refcount) - delete ptr; -} - - /** Merges several sorted streams into one sorted stream. */ class MergingSortedBlockInputStream : public IBlockInputStream @@ -78,44 +46,6 @@ public: Block getHeader() const override { return header; } protected: - struct RowRef - { - ColumnRawPtrs * columns = nullptr; - size_t row_num = 0; - SharedBlockPtr shared_block; - - void swap(RowRef & other) - { - std::swap(columns, other.columns); - std::swap(row_num, other.row_num); - std::swap(shared_block, other.shared_block); - } - - /// The number and types of columns must match. 
- bool operator==(const RowRef & other) const - { - size_t size = columns->size(); - for (size_t i = 0; i < size; ++i) - if (0 != (*columns)[i]->compareAt(row_num, other.row_num, *(*other.columns)[i], 1)) - return false; - return true; - } - - bool operator!=(const RowRef & other) const - { - return !(*this == other); - } - - void reset() - { - RowRef empty; - swap(empty); - } - - bool empty() const { return columns == nullptr; } - size_t size() const { return empty() ? 0 : columns->size(); } - }; - /// Simple class, which allows to check stop condition during merge process /// in simple case it just compare amount of merged rows with max_block_size /// in `count_average` case it compares amount of merged rows with linear combination @@ -148,7 +78,6 @@ protected: } }; - Block readImpl() override; void readSuffixImpl() override; @@ -230,7 +159,7 @@ protected: } template - void setRowRef(RowRef & row_ref, TSortCursor & cursor) + void setRowRef(SharedBlockRowRef & row_ref, TSortCursor & cursor) { row_ref.row_num = cursor.impl->pos; row_ref.shared_block = source_blocks[cursor.impl->order]; @@ -238,7 +167,7 @@ protected: } template - void setPrimaryKeyRef(RowRef & row_ref, TSortCursor & cursor) + void setPrimaryKeyRef(SharedBlockRowRef & row_ref, TSortCursor & cursor) { row_ref.row_num = cursor.impl->pos; row_ref.shared_block = source_blocks[cursor.impl->order]; diff --git a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h index 525c8a50754..7d85542520d 100644 --- a/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/ReplacingSortedBlockInputStream.h @@ -41,11 +41,11 @@ private: bool finished = false; /// Primary key of current row. - RowRef current_key; + SharedBlockRowRef current_key; /// Primary key of next row. - RowRef next_key; + SharedBlockRowRef next_key; /// Last row with maximum version for current primary key. - RowRef selected_row; + SharedBlockRowRef selected_row; /// The position (into current_row_sources) of the row with the highest version. size_t max_pos = 0; diff --git a/dbms/src/DataStreams/SummingSortedBlockInputStream.h b/dbms/src/DataStreams/SummingSortedBlockInputStream.h index 96f417eb113..4412e5529f8 100644 --- a/dbms/src/DataStreams/SummingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/SummingSortedBlockInputStream.h @@ -129,8 +129,8 @@ private: std::vector columns_to_aggregate; std::vector maps_to_sum; - RowRef current_key; /// The current primary key. - RowRef next_key; /// The primary key of the next row. + SharedBlockRowRef current_key; /// The current primary key. + SharedBlockRowRef next_key; /// The primary key of the next row. Row current_row; bool current_row_is_zero = true; /// Are all summed columns zero (or empty)? It is updated incrementally. 
diff --git a/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.cpp b/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.cpp index ee6690f2f0d..4dda97597bd 100644 --- a/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.cpp +++ b/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.cpp @@ -47,7 +47,7 @@ void VersionedCollapsingSortedBlockInputStream::insertGap(size_t gap_size) } } -void VersionedCollapsingSortedBlockInputStream::insertRow(size_t skip_rows, const RowRef & row, MutableColumns & merged_columns) +void VersionedCollapsingSortedBlockInputStream::insertRow(size_t skip_rows, const SharedBlockRowRef & row, MutableColumns & merged_columns) { const auto & columns = row.shared_block->all_columns; for (size_t i = 0; i < num_columns; ++i) @@ -111,7 +111,7 @@ void VersionedCollapsingSortedBlockInputStream::merge(MutableColumns & merged_co SortCursor current = queue.top(); size_t current_block_granularity = current->rows; - RowRef next_key; + SharedBlockRowRef next_key; Int8 sign = assert_cast(*current->all_columns[sign_column_number]).getData()[current->pos]; diff --git a/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.h b/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.h index d3da6cffd09..f79b564063d 100644 --- a/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/VersionedCollapsingSortedBlockInputStream.h @@ -197,7 +197,7 @@ private: Int8 sign_in_queue = 0; const size_t max_rows_in_queue; /// Rows with the same primary key and sign. - FixedSizeDequeWithGaps current_keys; + FixedSizeDequeWithGaps current_keys; size_t blocks_written = 0; @@ -207,7 +207,7 @@ private: void merge(MutableColumns & merged_columns, std::priority_queue & queue); /// Output to result row for the current primary key. 
- void insertRow(size_t skip_rows, const RowRef & row, MutableColumns & merged_columns); + void insertRow(size_t skip_rows, const SharedBlockRowRef & row, MutableColumns & merged_columns); void insertGap(size_t gap_size); }; diff --git a/dbms/src/Functions/array/arrayElement.cpp b/dbms/src/Functions/array/arrayElement.cpp index 59f275692aa..d7132d0fe23 100644 --- a/dbms/src/Functions/array/arrayElement.cpp +++ b/dbms/src/Functions/array/arrayElement.cpp @@ -858,7 +858,7 @@ void FunctionArrayElement::perform(Block & block, const ColumnNumbers & argument builder.initSink(input_rows_count); if (index == 0u) - throw Exception("Array indices is 1-based", ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX); + throw Exception("Array indices are 1-based", ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX); if (!(executeNumberConst(block, arguments, result, index, builder) || executeNumberConst(block, arguments, result, index, builder) diff --git a/dbms/src/IO/HDFSCommon.cpp b/dbms/src/IO/HDFSCommon.cpp index a94fbeabd60..1c9980105f0 100644 --- a/dbms/src/IO/HDFSCommon.cpp +++ b/dbms/src/IO/HDFSCommon.cpp @@ -1,4 +1,5 @@ #include +#include #if USE_HDFS #include @@ -11,8 +12,9 @@ extern const int BAD_ARGUMENTS; extern const int NETWORK_ERROR; } -HDFSBuilderPtr createHDFSBuilder(const Poco::URI & uri) +HDFSBuilderPtr createHDFSBuilder(const std::string & uri_str) { + const Poco::URI uri(uri_str); auto & host = uri.getHost(); auto port = uri.getPort(); auto & path = uri.getPath(); diff --git a/dbms/src/IO/HDFSCommon.h b/dbms/src/IO/HDFSCommon.h index 8c526d908bb..c84990dfea1 100644 --- a/dbms/src/IO/HDFSCommon.h +++ b/dbms/src/IO/HDFSCommon.h @@ -1,7 +1,6 @@ #include #include #include -#include #if USE_HDFS #include @@ -27,12 +26,32 @@ struct HDFSFsDeleter } +struct HDFSFileInfo +{ + hdfsFileInfo * file_info; + int length; + + HDFSFileInfo() + : file_info(nullptr) + , length(0) + { + } + HDFSFileInfo(const HDFSFileInfo & other) = delete; + HDFSFileInfo(HDFSFileInfo && other) = default; + HDFSFileInfo & operator=(const HDFSFileInfo & other) = delete; + HDFSFileInfo & operator=(HDFSFileInfo && other) = default; + + ~HDFSFileInfo() + { + hdfsFreeFileInfo(file_info, length); + } +}; using HDFSBuilderPtr = std::unique_ptr; using HDFSFSPtr = std::unique_ptr, detail::HDFSFsDeleter>; // set read/connect timeout, default value in libhdfs3 is about 1 hour, and too large /// TODO Allow to tune from query Settings. 
-HDFSBuilderPtr createHDFSBuilder(const Poco::URI & hdfs_uri); +HDFSBuilderPtr createHDFSBuilder(const std::string & hdfs_uri); HDFSFSPtr createHDFSFS(hdfsBuilder * builder); } #endif diff --git a/dbms/src/IO/ReadBufferFromHDFS.cpp b/dbms/src/IO/ReadBufferFromHDFS.cpp index 9c44048d4ce..48409683799 100644 --- a/dbms/src/IO/ReadBufferFromHDFS.cpp +++ b/dbms/src/IO/ReadBufferFromHDFS.cpp @@ -2,7 +2,6 @@ #if USE_HDFS #include -#include #include @@ -16,7 +15,7 @@ namespace ErrorCodes struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl { - Poco::URI hdfs_uri; + std::string hdfs_uri; hdfsFile fin; HDFSBuilderPtr builder; HDFSFSPtr fs; @@ -26,8 +25,8 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl , builder(createHDFSBuilder(hdfs_uri)) , fs(createHDFSFS(builder.get())) { - - auto & path = hdfs_uri.getPath(); + const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); + const std::string path = hdfs_uri.substr(begin_of_path); fin = hdfsOpenFile(fs.get(), path.c_str(), O_RDONLY, 0, 0, 0); if (fin == nullptr) @@ -39,7 +38,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl { int bytes_read = hdfsRead(fs.get(), fin, start, size); if (bytes_read < 0) - throw Exception("Fail to read HDFS file: " + hdfs_uri.toString() + " " + std::string(hdfsGetLastError()), + throw Exception("Fail to read HDFS file: " + hdfs_uri + " " + std::string(hdfsGetLastError()), ErrorCodes::NETWORK_ERROR); return bytes_read; } diff --git a/dbms/src/IO/WriteBufferFromHDFS.cpp b/dbms/src/IO/WriteBufferFromHDFS.cpp index 698c58bc000..2cd83ee6479 100644 --- a/dbms/src/IO/WriteBufferFromHDFS.cpp +++ b/dbms/src/IO/WriteBufferFromHDFS.cpp @@ -2,7 +2,6 @@ #if USE_HDFS -#include #include #include #include @@ -21,7 +20,7 @@ extern const int CANNOT_FSYNC; struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl { - Poco::URI hdfs_uri; + std::string hdfs_uri; hdfsFile fout; HDFSBuilderPtr builder; HDFSFSPtr fs; @@ -31,7 +30,11 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl , builder(createHDFSBuilder(hdfs_uri)) , fs(createHDFSFS(builder.get())) { - auto & path = hdfs_uri.getPath(); + const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); + const std::string path = hdfs_uri.substr(begin_of_path); + if (path.find("*?{") != std::string::npos) + throw Exception("URI '" + hdfs_uri + "' contains globs, so the table is in readonly mode", ErrorCodes::CANNOT_OPEN_FILE); + fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0); if (fout == nullptr) @@ -52,7 +55,7 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl { int bytes_written = hdfsWrite(fs.get(), fout, start, size); if (bytes_written < 0) - throw Exception("Fail to write HDFS file: " + hdfs_uri.toString() + " " + std::string(hdfsGetLastError()), + throw Exception("Fail to write HDFS file: " + hdfs_uri + " " + std::string(hdfsGetLastError()), ErrorCodes::NETWORK_ERROR); return bytes_written; } @@ -61,7 +64,7 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl { int result = hdfsSync(fs.get(), fout); if (result < 0) - throwFromErrno("Cannot HDFS sync" + hdfs_uri.toString() + " " + std::string(hdfsGetLastError()), + throwFromErrno("Cannot HDFS sync" + hdfs_uri + " " + std::string(hdfsGetLastError()), ErrorCodes::CANNOT_FSYNC); } }; diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index f60afe81276..7deb21d0dcc 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -93,14 +93,14 @@ NameSet AnalyzedJoin::getOriginalColumnsSet() const return out; } 
-std::unordered_map AnalyzedJoin::getOriginalColumnsMap(const NameSet & required_columns) const +NamesWithAliases AnalyzedJoin::getNamesWithAliases(const NameSet & required_columns) const { - std::unordered_map out; + NamesWithAliases out; for (const auto & column : required_columns) { auto it = original_names.find(column); if (it != original_names.end()) - out.insert(*it); + out.emplace_back(it->second, it->first); /// {original_name, name} } return out; } @@ -129,15 +129,15 @@ Names AnalyzedJoin::requiredJoinedNames() const return Names(required_columns_set.begin(), required_columns_set.end()); } -void AnalyzedJoin::appendRequiredColumns(const Block & sample, NameSet & required_columns) const +NamesWithAliases AnalyzedJoin::getRequiredColumns(const Block & sample, const Names & action_required_columns) const { - for (auto & column : key_names_right) + NameSet required_columns(action_required_columns.begin(), action_required_columns.end()); + + for (auto & column : requiredJoinedNames()) if (!sample.has(column)) required_columns.insert(column); - for (auto & column : columns_added_by_join) - if (!sample.has(column.name)) - required_columns.insert(column.name); + return getNamesWithAliases(required_columns); } void AnalyzedJoin::addJoinedColumn(const NameAndTypePair & joined_column) diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index 2622f35a941..bea430de479 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -64,12 +64,12 @@ public: NameSet getQualifiedColumnsSet() const; NameSet getOriginalColumnsSet() const; - std::unordered_map getOriginalColumnsMap(const NameSet & required_columns) const; + NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const; + NamesWithAliases getRequiredColumns(const Block & sample, const Names & action_columns) const; void deduplicateAndQualifyColumnNames(const NameSet & left_table_columns, const String & right_table_prefix); size_t rightKeyInclusion(const String & name) const; - void appendRequiredColumns(const Block & sample, NameSet & required_columns) const; void addJoinedColumn(const NameAndTypePair & joined_column); void addJoinedColumnsAndCorrectNullability(Block & sample_block) const; diff --git a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp index 4b0a76cfb37..ec29fbf40c1 100644 --- a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp +++ b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp @@ -1,6 +1,5 @@ #include #include -#include /// for getNamesAndTypeListFromTableExpression #include #include @@ -15,6 +14,8 @@ namespace DB NameSet removeDuplicateColumns(NamesAndTypesList & columns); +struct ASTTableExpression; +NamesAndTypesList getNamesAndTypeListFromTableExpression(const ASTTableExpression & table_expression, const Context & context); DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & identifier, const String & current_database) { diff --git a/dbms/src/Interpreters/ExpressionActions.h b/dbms/src/Interpreters/ExpressionActions.h index 6997c3ef759..20acd1a95c8 100644 --- a/dbms/src/Interpreters/ExpressionActions.h +++ b/dbms/src/Interpreters/ExpressionActions.h @@ -20,9 +20,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -using NameWithAlias = std::pair; -using NamesWithAliases = std::vector; - class AnalyzedJoin; class IPreparedFunction; diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp 
index d82169cf8e4..9777e3d508d 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -417,8 +418,7 @@ bool SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, b if (!ast_join) return false; - SubqueryForSet & subquery_for_set = getSubqueryForJoin(*ast_join); - syntax->analyzed_join->setHashJoin(subquery_for_set.join); + makeTableJoin(*ast_join); initChain(chain, sourceColumns()); ExpressionActionsChain::Step & step = chain.steps.back(); @@ -456,7 +456,15 @@ static JoinPtr tryGetStorageJoin(const ASTTablesInSelectQueryElement & join_elem return {}; } -SubqueryForSet & SelectQueryExpressionAnalyzer::getSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element) +static ExpressionActionsPtr createJoinedBlockActions(const Context & context, const AnalyzedJoin & analyzed_join) +{ + ASTPtr expression_list = analyzed_join.rightKeysList(); + auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, + analyzed_join.columnsFromJoinedTable(), analyzed_join.requiredJoinedNames()); + return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false); +} + +void SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) { /// Two JOINs are not supported with the same subquery, but different USINGs. auto join_hash = join_element.getTreeHash(); @@ -469,64 +477,45 @@ SubqueryForSet & SelectQueryExpressionAnalyzer::getSubqueryForJoin(const ASTTabl subquery_for_set.join = tryGetStorageJoin(join_element, context); if (!subquery_for_set.join) - makeHashJoin(join_element, subquery_for_set); + { + /// Actions which need to be calculated on joined block. + ExpressionActionsPtr joined_block_actions = createJoinedBlockActions(context, analyzedJoin()); - return subquery_for_set; + if (!subquery_for_set.source) + makeSubqueryForJoin(join_element, joined_block_actions, subquery_for_set); + + /// Test actions on sample block (early error detection) + Block sample_block = subquery_for_set.renamedSampleBlock(); + joined_block_actions->execute(sample_block); + + /// TODO You do not need to set this up when JOIN is only needed on remote servers. + subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join); + subquery_for_set.joined_block_actions = joined_block_actions; + } + + syntax->analyzed_join->setHashJoin(subquery_for_set.join); } -void SelectQueryExpressionAnalyzer::makeHashJoin(const ASTTablesInSelectQueryElement & join_element, - SubqueryForSet & subquery_for_set) const +void SelectQueryExpressionAnalyzer::makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, + const ExpressionActionsPtr & joined_block_actions, + SubqueryForSet & subquery_for_set) const { - /// Actions which need to be calculated on joined block. - ExpressionActionsPtr joined_block_actions = createJoinedBlockActions(); - /** For GLOBAL JOINs (in the case, for example, of the push method for executing GLOBAL subqueries), the following occurs * - in the addExternalStorage function, the JOIN (SELECT ...) subquery is replaced with JOIN _data1, * in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`. * - this function shows the expression JOIN _data1. 
*/ - if (!subquery_for_set.source) - { - ASTPtr table; - auto & table_to_join = join_element.table_expression->as(); - if (table_to_join.subquery) - table = table_to_join.subquery; - else if (table_to_join.table_function) - table = table_to_join.table_function; - else if (table_to_join.database_and_table_name) - table = table_to_join.database_and_table_name; + NamesWithAliases required_columns_with_aliases = + analyzedJoin().getRequiredColumns(joined_block_actions->getSampleBlock(), joined_block_actions->getRequiredColumns()); - Names action_columns = joined_block_actions->getRequiredColumns(); - NameSet required_columns(action_columns.begin(), action_columns.end()); + Names original_columns; + for (auto & pr : required_columns_with_aliases) + original_columns.push_back(pr.first); - analyzedJoin().appendRequiredColumns(joined_block_actions->getSampleBlock(), required_columns); + auto interpreter = interpretSubquery(join_element.table_expression, context, subquery_depth, original_columns); - auto original_map = analyzedJoin().getOriginalColumnsMap(required_columns); - Names original_columns; - for (auto & pr : original_map) - original_columns.push_back(pr.second); - - auto interpreter = interpretSubquery(table, context, subquery_depth, original_columns); - - subquery_for_set.makeSource(interpreter, original_map); - } - - Block sample_block = subquery_for_set.renamedSampleBlock(); - joined_block_actions->execute(sample_block); - - /// TODO You do not need to set this up when JOIN is only needed on remote servers. - subquery_for_set.join = analyzedJoin().makeHashJoin(sample_block, settings.size_limits_for_join); - subquery_for_set.joined_block_actions = joined_block_actions; -} - -ExpressionActionsPtr SelectQueryExpressionAnalyzer::createJoinedBlockActions() const -{ - ASTPtr expression_list = analyzedJoin().rightKeysList(); - Names required_columns = analyzedJoin().requiredJoinedNames(); - - auto syntax_result = SyntaxAnalyzer(context).analyze(expression_list, analyzedJoin().columnsFromJoinedTable(), required_columns); - return ExpressionAnalyzer(expression_list, syntax_result, context).getActions(true, false); + subquery_for_set.makeSource(interpreter, std::move(required_columns_with_aliases)); } bool SelectQueryExpressionAnalyzer::appendPrewhere( diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.h b/dbms/src/Interpreters/ExpressionAnalyzer.h index aebbaf038cc..9356046aee3 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.h +++ b/dbms/src/Interpreters/ExpressionAnalyzer.h @@ -26,9 +26,6 @@ class ASTExpressionList; class ASTSelectQuery; struct ASTTablesInSelectQueryElement; -struct SyntaxAnalyzerResult; -using SyntaxAnalyzerResultPtr = std::shared_ptr; - /// ExpressionAnalyzer sources, intermediates and results. It splits data and logic, allows to test them separately. 
struct ExpressionAnalyzerData { @@ -222,9 +219,9 @@ private: */ void tryMakeSetForIndexFromSubquery(const ASTPtr & subquery_or_table_name); - SubqueryForSet & getSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element); - ExpressionActionsPtr createJoinedBlockActions() const; - void makeHashJoin(const ASTTablesInSelectQueryElement & join_element, SubqueryForSet & subquery_for_set) const; + void makeTableJoin(const ASTTablesInSelectQueryElement & join_element); + void makeSubqueryForJoin(const ASTTablesInSelectQueryElement & join_element, const ExpressionActionsPtr & joined_block_actions, + SubqueryForSet & subquery_for_set) const; const ASTSelectQuery * getAggregatingQuery() const; }; diff --git a/dbms/src/Interpreters/FillingRow.cpp b/dbms/src/Interpreters/FillingRow.cpp new file mode 100644 index 00000000000..9d4c81dc70b --- /dev/null +++ b/dbms/src/Interpreters/FillingRow.cpp @@ -0,0 +1,127 @@ +#include + +namespace DB +{ + +bool less(const Field & lhs, const Field & rhs, int direction) +{ + if (direction == -1) + return applyVisitor(FieldVisitorAccurateLess(), rhs, lhs); + + return applyVisitor(FieldVisitorAccurateLess(), lhs, rhs); +} + +bool equals(const Field & lhs, const Field & rhs) +{ + return applyVisitor(FieldVisitorAccurateEquals(), lhs, rhs); +} + + +FillingRow::FillingRow(const SortDescription & description_) : description(description_) +{ + row.resize(description.size()); +} + +bool FillingRow::operator<(const FillingRow & other) const +{ + for (size_t i = 0; i < size(); ++i) + { + if (row[i].isNull() || other[i].isNull() || equals(row[i], other[i])) + continue; + return less(row[i], other[i], getDirection(i)); + } + return false; +} + +bool FillingRow::operator==(const FillingRow & other) const +{ + for (size_t i = 0; i < size(); ++i) + if (!equals(row[i], other[i])) + return false; + return true; +} + +bool FillingRow::next(const FillingRow & to_row) +{ + size_t pos = 0; + + /// Find position we need to increment for generating next row. + for (; pos < row.size(); ++pos) + if (!row[pos].isNull() && !to_row[pos].isNull() && !equals(row[pos], to_row[pos])) + break; + + if (pos == row.size() || less(to_row[pos], row[pos], getDirection(pos))) + return false; + + /// If we have any 'fill_to' value at position greater than 'pos', + /// we need to generate rows up to 'fill_to' value. 
+ for (size_t i = row.size() - 1; i > pos; --i) + { + if (getFillDescription(i).fill_to.isNull() || row[i].isNull()) + continue; + + auto next_value = row[i]; + applyVisitor(FieldVisitorSum(getFillDescription(i).fill_step), next_value); + if (less(next_value, getFillDescription(i).fill_to, getDirection(i))) + { + row[i] = next_value; + initFromDefaults(i + 1); + return true; + } + } + + auto next_value = row[pos]; + applyVisitor(FieldVisitorSum(getFillDescription(pos).fill_step), next_value); + + if (less(to_row[pos], next_value, getDirection(pos))) + return false; + + row[pos] = next_value; + if (equals(row[pos], to_row[pos])) + { + bool is_less = false; + for (size_t i = pos + 1; i < size(); ++i) + { + const auto & fill_from = getFillDescription(i).fill_from; + if (!fill_from.isNull()) + row[i] = fill_from; + else + row[i] = to_row[i]; + is_less |= less(row[i], to_row[i], getDirection(i)); + } + + return is_less; + } + + initFromDefaults(pos + 1); + return true; +} + +void FillingRow::initFromDefaults(size_t from_pos) +{ + for (size_t i = from_pos; i < row.size(); ++i) + row[i] = getFillDescription(i).fill_from; +} + + +void insertFromFillingRow(MutableColumns & filling_columns, MutableColumns & other_columns, const FillingRow & filling_row) +{ + for (size_t i = 0; i < filling_columns.size(); ++i) + { + if (filling_row[i].isNull()) + filling_columns[i]->insertDefault(); + else + filling_columns[i]->insert(filling_row[i]); + } + + for (size_t i = 0; i < other_columns.size(); ++i) + other_columns[i]->insertDefault(); +} + +void copyRowFromColumns(MutableColumns & dest, const Columns & source, size_t row_num) +{ + for (size_t i = 0; i < source.size(); ++i) + dest[i]->insertFrom(*source[i], row_num); +} + +} diff --git a/dbms/src/Interpreters/FillingRow.h b/dbms/src/Interpreters/FillingRow.h new file mode 100644 index 00000000000..1753508e139 --- /dev/null +++ b/dbms/src/Interpreters/FillingRow.h @@ -0,0 +1,44 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +/// Compares fields in terms of sorting order, considering direction. +bool less(const Field & lhs, const Field & rhs, int direction); +bool equals(const Field & lhs, const Field & rhs); + +/** Helps to implement modifier WITH FILL for ORDER BY clause. + * Stores row as array of fields and provides functions to generate next row for filling gaps and for comparing rows. + * Used in FillingBlockInputStream and in FillingTransform. + */ +class FillingRow +{ +public: + FillingRow(const SortDescription & sort_description); + + /// Generates next row according to fill 'from', 'to' and 'step' values. 
+ bool next(const FillingRow & to_row); + + void initFromDefaults(size_t from_pos = 0); + + Field & operator[](size_t ind) { return row[ind]; } + const Field & operator[](size_t ind) const { return row[ind]; } + size_t size() const { return row.size(); } + bool operator<(const FillingRow & other) const; + bool operator==(const FillingRow & other) const; + + int getDirection(size_t ind) const { return description[ind].direction; } + FillColumnDescription & getFillDescription(size_t ind) { return description[ind].fill_description; } + +private: + std::vector row; + SortDescription description; +}; + +void insertFromFillingRow(MutableColumns & filling_columns, MutableColumns & other_columns, const FillingRow & filling_row); +void copyRowFromColumns(MutableColumns & dest, const Columns & source, size_t row_num); + +} diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index e1470f33ca2..05ac99196a4 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #include #include @@ -57,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +107,7 @@ namespace ErrorCodes extern const int PARAMETER_OUT_OF_BOUND; extern const int ARGUMENT_OUT_OF_BOUND; extern const int INVALID_LIMIT_EXPRESSION; + extern const int INVALID_WITH_FILL_EXPRESSION; } namespace @@ -681,8 +686,62 @@ InterpreterSelectQuery::analyzeExpressions( return res; } +static Field getWithFillFieldValue(const ASTPtr & node, const Context & context) +{ + const auto & [field, type] = evaluateConstantExpression(node, context); -static SortDescription getSortDescription(const ASTSelectQuery & query) + if (!isColumnedAsNumber(type)) + throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be numeric type", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + + return field; +} + +static FillColumnDescription getWithFillDescription(const ASTOrderByElement & order_by_elem, const Context & context) +{ + FillColumnDescription descr; + if (order_by_elem.fill_from) + descr.fill_from = getWithFillFieldValue(order_by_elem.fill_from, context); + if (order_by_elem.fill_to) + descr.fill_to = getWithFillFieldValue(order_by_elem.fill_to, context); + if (order_by_elem.fill_step) + descr.fill_step = getWithFillFieldValue(order_by_elem.fill_step, context); + else + descr.fill_step = order_by_elem.direction; + + if (applyVisitor(FieldVisitorAccurateEquals(), descr.fill_step, Field{0})) + throw Exception("WITH FILL STEP value cannot be zero", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + + if (order_by_elem.direction == 1) + { + if (applyVisitor(FieldVisitorAccurateLess(), descr.fill_step, Field{0})) + throw Exception("WITH FILL STEP value cannot be negative for sorting in ascending direction", + ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + + if (!descr.fill_from.isNull() && !descr.fill_to.isNull() && + applyVisitor(FieldVisitorAccurateLess(), descr.fill_to, descr.fill_from)) + { + throw Exception("WITH FILL TO value cannot be less than FROM value for sorting in ascending direction", + ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + } + } + else + { + if (applyVisitor(FieldVisitorAccurateLess(), Field{0}, descr.fill_step)) + throw Exception("WITH FILL STEP value cannot be positive for 
sorting in descending direction", + ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + + if (!descr.fill_from.isNull() && !descr.fill_to.isNull() && + applyVisitor(FieldVisitorAccurateLess(), descr.fill_from, descr.fill_to)) + { + throw Exception("WITH FILL FROM value cannot be less than TO value for sorting in descending direction", + ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + } + } + + return descr; +} + +static SortDescription getSortDescription(const ASTSelectQuery & query, const Context & context) { SortDescription order_descr; order_descr.reserve(query.orderBy()->children.size()); @@ -695,13 +754,19 @@ static SortDescription getSortDescription(const ASTSelectQuery & query) if (order_by_elem.collation) collator = std::make_shared(order_by_elem.collation->as().value.get()); - order_descr.emplace_back(name, order_by_elem.direction, order_by_elem.nulls_direction, collator); + if (order_by_elem.with_fill) + { + FillColumnDescription fill_desc = getWithFillDescription(order_by_elem, context); + order_descr.emplace_back(name, order_by_elem.direction, + order_by_elem.nulls_direction, collator, true, fill_desc); + } + else + order_descr.emplace_back(name, order_by_elem.direction, order_by_elem.nulls_direction, collator); } return order_descr; } - static UInt64 getLimitUIntValue(const ASTPtr & node, const Context & context) { const auto & [field, type] = evaluateConstantExpression(node, context); @@ -736,7 +801,7 @@ static std::pair getLimitLengthAndOffset(const ASTSelectQuery & static UInt64 getLimitForSorting(const ASTSelectQuery & query, const Context & context) { /// Partial sort can be done if there is LIMIT but no DISTINCT or LIMIT BY. - if (!query.distinct && !query.limitBy()) + if (!query.distinct && !query.limitBy() && !query.limit_with_ties) { auto [limit_length, limit_offset] = getLimitLengthAndOffset(query, context); return limit_length + limit_offset; @@ -751,7 +816,7 @@ static SortingInfoPtr optimizeReadInOrder(const MergeTreeData & merge_tree, cons if (!merge_tree.hasSortingKey()) return {}; - auto order_descr = getSortDescription(query); + auto order_descr = getSortDescription(query, context); SortDescription prefix_order_descr; int read_direction = order_descr.at(0).direction; @@ -926,6 +991,21 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputS pipeline.streams.back() = std::make_shared( pipeline.streams.back(), expressions.prewhere_info->prewhere_actions, expressions.prewhere_info->prewhere_column_name, expressions.prewhere_info->remove_prewhere_column); + + // To remove additional columns in dry run + // For example, sample column which can be removed in this stage + if (expressions.prewhere_info->remove_columns_actions) + { + if constexpr (pipeline_with_processors) + { + pipeline.addSimpleTransform([&](const Block & header) + { + return std::make_shared(header, expressions.prewhere_info->remove_columns_actions); + }); + } + else + pipeline.streams.back() = std::make_shared(pipeline.streams.back(), expressions.prewhere_info->remove_columns_actions); + } } } else @@ -1173,7 +1253,7 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputS /** Optimization - if there are several sources and there is LIMIT, then first apply the preliminary LIMIT, * limiting the number of rows in each up to `offset + limit`. 
*/ - if (query.limitLength() && pipeline.hasMoreThanOneStream() && !query.distinct && !expressions.has_limit_by && !settings.extremes) + if (query.limitLength() && !query.limit_with_ties && pipeline.hasMoreThanOneStream() && !query.distinct && !expressions.has_limit_by && !settings.extremes) { executePreLimit(pipeline); } @@ -1206,6 +1286,8 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputS executeLimitBy(pipeline); } + executeWithFill(pipeline); + /** We must do projection after DISTINCT because projection may remove some columns. */ executeProjection(pipeline, expressions.final_projection); @@ -1222,7 +1304,6 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputS executeSubqueriesInSetsAndJoins(pipeline, expressions.subqueries_for_sets); } - template void InterpreterSelectQuery::executeFetchColumns( QueryProcessingStage::Enum processing_stage, TPipeline & pipeline, @@ -1420,11 +1501,12 @@ void InterpreterSelectQuery::executeFetchColumns( auto [limit_length, limit_offset] = getLimitLengthAndOffset(query, context); - /** Optimization - if not specified DISTINCT, WHERE, GROUP, HAVING, ORDER, LIMIT BY but LIMIT is specified, and limit + offset < max_block_size, + /** Optimization - if not specified DISTINCT, WHERE, GROUP, HAVING, ORDER, LIMIT BY, WITH TIES but LIMIT is specified, and limit + offset < max_block_size, * then as the block size we will use limit + offset (not to read more from the table than requested), * and also set the number of threads to 1. */ if (!query.distinct + && !query.limit_with_ties && !query.prewhere() && !query.where() && !query.groupBy() @@ -1495,12 +1577,22 @@ void InterpreterSelectQuery::executeFetchColumns( streams = {std::make_shared(storage->getSampleBlockForColumns(required_columns))}; if (query_info.prewhere_info) + { streams.back() = std::make_shared( streams.back(), prewhere_info->prewhere_actions, prewhere_info->prewhere_column_name, prewhere_info->remove_prewhere_column); + // To remove additional columns + // In some cases, we did not read any marks so that the pipeline.streams is empty + // Thus, some columns in prewhere are not removed as expected + // This leads to mismatched header in distributed table + if (query_info.prewhere_info->remove_columns_actions) + { + streams.back() = std::make_shared(streams.back(), query_info.prewhere_info->remove_columns_actions); + } + } } for (auto & stream : streams) @@ -2007,7 +2099,7 @@ void InterpreterSelectQuery::executeExpression(QueryPipeline & pipeline, const E void InterpreterSelectQuery::executeOrder(Pipeline & pipeline, SortingInfoPtr sorting_info) { auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query); + SortDescription order_descr = getSortDescription(query, context); const Settings & settings = context.getSettingsRef(); UInt64 limit = getLimitForSorting(query, context); @@ -2079,7 +2171,7 @@ void InterpreterSelectQuery::executeOrder(Pipeline & pipeline, SortingInfoPtr so void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, SortingInfoPtr sorting_info) { auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query); + SortDescription order_descr = getSortDescription(query, context); UInt64 limit = getLimitForSorting(query, context); const Settings & settings = context.getSettingsRef(); @@ -2160,7 +2252,7 @@ void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, SortingInfoP void InterpreterSelectQuery::executeMergeSorted(Pipeline & pipeline) { 
auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query); + SortDescription order_descr = getSortDescription(query, context); UInt64 limit = getLimitForSorting(query, context); const Settings & settings = context.getSettingsRef(); @@ -2187,7 +2279,7 @@ void InterpreterSelectQuery::executeMergeSorted(Pipeline & pipeline) void InterpreterSelectQuery::executeMergeSorted(QueryPipeline & pipeline) { auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query); + SortDescription order_descr = getSortDescription(query, context); UInt64 limit = getLimitForSorting(query, context); const Settings & settings = context.getSettingsRef(); @@ -2234,7 +2326,7 @@ void InterpreterSelectQuery::executeDistinct(Pipeline & pipeline, bool before_or UInt64 limit_for_distinct = 0; /// If after this stage of DISTINCT ORDER BY is not executed, then you can get no more than limit_length + limit_offset of different rows. - if (!query.orderBy() || !before_order) + if ((!query.orderBy() || !before_order) && !query.limit_with_ties) limit_for_distinct = limit_length + limit_offset; pipeline.transform([&](auto & stream) @@ -2303,9 +2395,16 @@ void InterpreterSelectQuery::executePreLimit(Pipeline & pipeline) if (query.limitLength()) { auto [limit_length, limit_offset] = getLimitLengthAndOffset(query, context); + SortDescription sort_descr; + if (query.limit_with_ties) + { + if (!query.orderBy()) + throw Exception("LIMIT WITH TIES without ORDER BY", ErrorCodes::LOGICAL_ERROR); + sort_descr = getSortDescription(query, context); + } pipeline.transform([&, limit = limit_length + limit_offset](auto & stream) { - stream = std::make_shared(stream, limit, 0, false); + stream = std::make_shared(stream, limit, 0, false, false, query.limit_with_ties, sort_descr); }); } } @@ -2417,17 +2516,73 @@ void InterpreterSelectQuery::executeLimit(Pipeline & pipeline) if (!query.group_by_with_totals && hasWithTotalsInAnySubqueryInFromClause(query)) always_read_till_end = true; + SortDescription order_descr; + if (query.limit_with_ties) + { + if (!query.orderBy()) + throw Exception("LIMIT WITH TIES without ORDER BY", ErrorCodes::LOGICAL_ERROR); + order_descr = getSortDescription(query, context); + } + UInt64 limit_length; UInt64 limit_offset; std::tie(limit_length, limit_offset) = getLimitLengthAndOffset(query, context); pipeline.transform([&](auto & stream) { - stream = std::make_shared(stream, limit_length, limit_offset, always_read_till_end); + stream = std::make_shared(stream, limit_length, limit_offset, always_read_till_end, false, query.limit_with_ties, order_descr); }); } } + +void InterpreterSelectQuery::executeWithFill(Pipeline & pipeline) +{ + auto & query = getSelectQuery(); + if (query.orderBy()) + { + SortDescription order_descr = getSortDescription(query, context); + SortDescription fill_descr; + for (auto & desc : order_descr) + { + if (desc.with_fill) + fill_descr.push_back(desc); + } + + if (fill_descr.empty()) + return; + + pipeline.transform([&](auto & stream) + { + stream = std::make_shared(stream, fill_descr); + }); + } +} + +void InterpreterSelectQuery::executeWithFill(QueryPipeline & pipeline) +{ + auto & query = getSelectQuery(); + if (query.orderBy()) + { + SortDescription order_descr = getSortDescription(query, context); + SortDescription fill_descr; + for (auto & desc : order_descr) + { + if (desc.with_fill) + fill_descr.push_back(desc); + } + + if (fill_descr.empty()) + return; + + pipeline.addSimpleTransform([&](const Block & header) + { + return 
std::make_shared(header, fill_descr); + }); + } +} + + void InterpreterSelectQuery::executeLimit(QueryPipeline & pipeline) { auto & query = getSelectQuery(); @@ -2455,13 +2610,21 @@ void InterpreterSelectQuery::executeLimit(QueryPipeline & pipeline) UInt64 limit_offset; std::tie(limit_length, limit_offset) = getLimitLengthAndOffset(query, context); + SortDescription order_descr; + if (query.limit_with_ties) + { + if (!query.orderBy()) + throw Exception("LIMIT WITH TIES without ORDER BY", ErrorCodes::LOGICAL_ERROR); + order_descr = getSortDescription(query, context); + } + pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) -> ProcessorPtr { if (stream_type != QueryPipeline::StreamType::Main) return nullptr; return std::make_shared( - header, limit_length, limit_offset, always_read_till_end); + header, limit_length, limit_offset, always_read_till_end, query.limit_with_ties, order_descr); }); } } diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.h b/dbms/src/Interpreters/InterpreterSelectQuery.h index 3a441445c9b..b1707db3480 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.h +++ b/dbms/src/Interpreters/InterpreterSelectQuery.h @@ -204,6 +204,7 @@ private: void executeHaving(Pipeline & pipeline, const ExpressionActionsPtr & expression); void executeExpression(Pipeline & pipeline, const ExpressionActionsPtr & expression); void executeOrder(Pipeline & pipeline, SortingInfoPtr sorting_info); + void executeWithFill(Pipeline & pipeline); void executeMergeSorted(Pipeline & pipeline); void executePreLimit(Pipeline & pipeline); void executeUnion(Pipeline & pipeline, Block header); /// If header is not empty, convert streams structure to it. @@ -221,6 +222,7 @@ private: void executeHaving(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); void executeExpression(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); void executeOrder(QueryPipeline & pipeline, SortingInfoPtr sorting_info); + void executeWithFill(QueryPipeline & pipeline); void executeMergeSorted(QueryPipeline & pipeline); void executePreLimit(QueryPipeline & pipeline); void executeLimitBy(QueryPipeline & pipeline); diff --git a/dbms/src/Interpreters/QueryNormalizer.cpp b/dbms/src/Interpreters/QueryNormalizer.cpp index ffa94f3d700..c2991885cf3 100644 --- a/dbms/src/Interpreters/QueryNormalizer.cpp +++ b/dbms/src/Interpreters/QueryNormalizer.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include diff --git a/dbms/src/Interpreters/SubqueryForSet.cpp b/dbms/src/Interpreters/SubqueryForSet.cpp index f6528bf110c..6e0cd540db4 100644 --- a/dbms/src/Interpreters/SubqueryForSet.cpp +++ b/dbms/src/Interpreters/SubqueryForSet.cpp @@ -1,5 +1,4 @@ #include -#include #include #include @@ -7,26 +6,14 @@ namespace DB { void SubqueryForSet::makeSource(std::shared_ptr & interpreter, - const std::unordered_map & name_to_origin) + NamesWithAliases && joined_block_aliases_) { + joined_block_aliases = std::move(joined_block_aliases_); source = std::make_shared(interpreter->getSampleBlock(), [interpreter]() mutable { return interpreter->execute().in; }); - for (const auto & names : name_to_origin) - joined_block_aliases.emplace_back(names.second, names.first); - sample_block = source->getHeader(); - for (const auto & name_with_alias : joined_block_aliases) - { - if (sample_block.has(name_with_alias.first)) - { - auto pos = sample_block.getPositionByName(name_with_alias.first); - auto column = sample_block.getByPosition(pos); - 
sample_block.erase(pos); - column.name = name_with_alias.second; - sample_block.insert(std::move(column)); - } - } + renameColumns(sample_block); } void SubqueryForSet::renameColumns(Block & block) diff --git a/dbms/src/Interpreters/SubqueryForSet.h b/dbms/src/Interpreters/SubqueryForSet.h index 79d32d836c6..abba7a4ec2f 100644 --- a/dbms/src/Interpreters/SubqueryForSet.h +++ b/dbms/src/Interpreters/SubqueryForSet.h @@ -31,7 +31,7 @@ struct SubqueryForSet StoragePtr table; void makeSource(std::shared_ptr & interpreter, - const std::unordered_map & name_to_origin); + NamesWithAliases && joined_block_aliases_); Block renamedSampleBlock() const { return sample_block; } void renameColumns(Block & block); diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp index 3419a5baba3..dd0c37c50b5 100644 --- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp +++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include /// getSmallestColumn() #include diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.h b/dbms/src/Interpreters/SyntaxAnalyzer.h index a2187ee2ef0..44fdc61ded3 100644 --- a/dbms/src/Interpreters/SyntaxAnalyzer.h +++ b/dbms/src/Interpreters/SyntaxAnalyzer.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include #include #include @@ -11,6 +11,9 @@ namespace DB NameSet removeDuplicateColumns(NamesAndTypesList & columns); class ASTFunction; +class AnalyzedJoin; +class Context; +struct SelectQueryOptions; struct SyntaxAnalyzerResult { diff --git a/dbms/src/Interpreters/interpretSubquery.cpp b/dbms/src/Interpreters/interpretSubquery.cpp index 76b570ecdb9..82545d4b3be 100644 --- a/dbms/src/Interpreters/interpretSubquery.cpp +++ b/dbms/src/Interpreters/interpretSubquery.cpp @@ -18,6 +18,19 @@ namespace DB std::shared_ptr interpretSubquery( const ASTPtr & table_expression, const Context & context, size_t subquery_depth, const Names & required_source_columns) { + if (auto * expr = table_expression->as()) + { + ASTPtr table; + if (expr->subquery) + table = expr->subquery; + else if (expr->table_function) + table = expr->table_function; + else if (expr->database_and_table_name) + table = expr->database_and_table_name; + + return interpretSubquery(table, context, subquery_depth, required_source_columns); + } + /// Subquery or table name. The name of the table is similar to the subquery `SELECT * FROM t`. const auto * subquery = table_expression->as(); const auto * function = table_expression->as(); diff --git a/dbms/src/Parsers/ASTOrderByElement.cpp b/dbms/src/Parsers/ASTOrderByElement.cpp index 3ec5674ab27..ac57cfa437b 100644 --- a/dbms/src/Parsers/ASTOrderByElement.cpp +++ b/dbms/src/Parsers/ASTOrderByElement.cpp @@ -25,6 +25,26 @@ void ASTOrderByElement::formatImpl(const FormatSettings & settings, FormatState settings.ostr << (settings.hilite ? hilite_keyword : "") << " COLLATE " << (settings.hilite ? hilite_none : ""); collation->formatImpl(settings, state, frame); } + + if (with_fill) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " WITH FILL " << (settings.hilite ? hilite_none : ""); + if (fill_from) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM " << (settings.hilite ? hilite_none : ""); + fill_from->formatImpl(settings, state, frame); + } + if (fill_to) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " TO " << (settings.hilite ? 
hilite_none : ""); + fill_to->formatImpl(settings, state, frame); + } + if (fill_step) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " STEP " << (settings.hilite ? hilite_none : ""); + fill_step->formatImpl(settings, state, frame); + } + } } } diff --git a/dbms/src/Parsers/ASTOrderByElement.h b/dbms/src/Parsers/ASTOrderByElement.h index 729915400ce..30da8172f52 100644 --- a/dbms/src/Parsers/ASTOrderByElement.h +++ b/dbms/src/Parsers/ASTOrderByElement.h @@ -18,12 +18,22 @@ public: /** Collation for locale-specific string comparison. If empty, then sorting done by bytes. */ ASTPtr collation; + bool with_fill; + ASTPtr fill_from; + ASTPtr fill_to; + ASTPtr fill_step; + ASTOrderByElement( - const int direction_, const int nulls_direction_, const bool nulls_direction_was_explicitly_specified_, ASTPtr & collation_) + const int direction_, const int nulls_direction_, const bool nulls_direction_was_explicitly_specified_, + ASTPtr & collation_, const bool with_fill_, ASTPtr & fill_from_, ASTPtr & fill_to_, ASTPtr & fill_step_) : direction(direction_) , nulls_direction(nulls_direction_) , nulls_direction_was_explicitly_specified(nulls_direction_was_explicitly_specified_) , collation(collation_) + , with_fill(with_fill_) + , fill_from(fill_from_) + , fill_to(fill_to_) + , fill_step(fill_step_) { } diff --git a/dbms/src/Parsers/ASTSelectQuery.cpp b/dbms/src/Parsers/ASTSelectQuery.cpp index 16396095ce9..b06d786a5f9 100644 --- a/dbms/src/Parsers/ASTSelectQuery.cpp +++ b/dbms/src/Parsers/ASTSelectQuery.cpp @@ -148,6 +148,8 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F s.ostr << ", "; } limitLength()->formatImpl(s, state, frame); + if (limit_with_ties) + s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << " WITH TIES" << (s.hilite ? 
hilite_none : ""); } if (settings()) diff --git a/dbms/src/Parsers/ASTSelectQuery.h b/dbms/src/Parsers/ASTSelectQuery.h index b94800ee0be..38ba12b88a2 100644 --- a/dbms/src/Parsers/ASTSelectQuery.h +++ b/dbms/src/Parsers/ASTSelectQuery.h @@ -42,6 +42,7 @@ public: bool group_by_with_totals = false; bool group_by_with_rollup = false; bool group_by_with_cube = false; + bool limit_with_ties = false; ASTPtr & refSelect() { return getExpression(Expression::SELECT); } ASTPtr & refTables() { return getExpression(Expression::TABLES); } diff --git a/dbms/src/Parsers/ExpressionElementParsers.cpp b/dbms/src/Parsers/ExpressionElementParsers.cpp index eddbe2abb2f..8f0f7fffce2 100644 --- a/dbms/src/Parsers/ExpressionElementParsers.cpp +++ b/dbms/src/Parsers/ExpressionElementParsers.cpp @@ -1360,7 +1360,12 @@ bool ParserOrderByElement::parseImpl(Pos & pos, ASTPtr & node, Expected & expect ParserKeyword first("FIRST"); ParserKeyword last("LAST"); ParserKeyword collate("COLLATE"); + ParserKeyword with_fill("WITH FILL"); + ParserKeyword from("FROM"); + ParserKeyword to("TO"); + ParserKeyword step("STEP"); ParserStringLiteral collate_locale_parser; + ParserExpressionWithOptionalAlias exp_parser(false); ASTPtr expr_elem; if (!elem_p.parse(pos, expr_elem, expected)) @@ -1395,7 +1400,27 @@ bool ParserOrderByElement::parseImpl(Pos & pos, ASTPtr & node, Expected & expect return false; } - node = std::make_shared(direction, nulls_direction, nulls_direction_was_explicitly_specified, locale_node); + /// WITH FILL [FROM x] [TO y] [STEP z] + bool has_with_fill = false; + ASTPtr fill_from; + ASTPtr fill_to; + ASTPtr fill_step; + if (with_fill.ignore(pos)) + { + has_with_fill = true; + if (from.ignore(pos) && !exp_parser.parse(pos, fill_from, expected)) + return false; + + if (to.ignore(pos) && !exp_parser.parse(pos, fill_to, expected)) + return false; + + if (step.ignore(pos) && !exp_parser.parse(pos, fill_step, expected)) + return false; + } + + node = std::make_shared( + direction, nulls_direction, nulls_direction_was_explicitly_specified, locale_node, + has_with_fill, fill_from, fill_to, fill_step); node->children.push_back(expr_elem); if (locale_node) node->children.push_back(locale_node); diff --git a/dbms/src/Parsers/ExpressionElementParsers.h b/dbms/src/Parsers/ExpressionElementParsers.h index 9a87a78a5a3..31ab4b0189a 100644 --- a/dbms/src/Parsers/ExpressionElementParsers.h +++ b/dbms/src/Parsers/ExpressionElementParsers.h @@ -289,6 +289,7 @@ protected: /** Element of ORDER BY expression - same as expression element, but in addition, ASC[ENDING] | DESC[ENDING] could be specified * and optionally, NULLS LAST|FIRST * and optionally, COLLATE 'locale'. 
+ * and optionally, WITH FILL [FROM x] [TO y] [STEP z] */ class ParserOrderByElement : public IParserBase { diff --git a/dbms/src/Parsers/ParserSelectQuery.cpp b/dbms/src/Parsers/ParserSelectQuery.cpp index afef7842ef6..e65ef9d3c92 100644 --- a/dbms/src/Parsers/ParserSelectQuery.cpp +++ b/dbms/src/Parsers/ParserSelectQuery.cpp @@ -17,6 +17,7 @@ namespace ErrorCodes { extern const int SYNTAX_ERROR; extern const int TOP_AND_LIMIT_TOGETHER; + extern const int WITH_TIES_WITHOUT_ORDER_BY; } @@ -41,6 +42,7 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserKeyword s_rollup("ROLLUP"); ParserKeyword s_cube("CUBE"); ParserKeyword s_top("TOP"); + ParserKeyword s_with_ties("WITH TIES"); ParserKeyword s_offset("OFFSET"); ParserNotEmptyExpressionList exp_list(false); @@ -76,7 +78,7 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } } - /// SELECT [DISTINCT] [TOP N] expr list + /// SELECT [DISTINCT] [TOP N [WITH TIES]] expr list { if (!s_select.ignore(pos, expected)) return false; @@ -100,6 +102,9 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!num.parse(pos, limit_length, expected)) return false; } + + if (s_with_ties.ignore(pos, expected)) + select_query->limit_with_ties = true; } if (!exp_list_for_select_clause.parse(pos, select_expression_list, expected)) @@ -197,12 +202,18 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) limit_offset = limit_length; if (!exp_elem.parse(pos, limit_length, expected)) return false; + + if (s_with_ties.ignore(pos, expected)) + select_query->limit_with_ties = true; } else if (s_offset.ignore(pos, expected)) { if (!exp_elem.parse(pos, limit_offset, expected)) return false; } + else if (s_with_ties.ignore(pos, expected)) + select_query->limit_with_ties = true; + if (s_by.ignore(pos, expected)) { limit_by_length = limit_length; @@ -215,7 +226,7 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } } - /// LIMIT length | LIMIT offset, length + /// LIMIT length [WITH TIES] | LIMIT offset, length [WITH TIES] if (s_limit.ignore(pos, expected)) { if (!limit_by_length|| limit_length) @@ -237,8 +248,15 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!exp_elem.parse(pos, limit_offset, expected)) return false; } + + if (s_with_ties.ignore(pos, expected)) + select_query->limit_with_ties = true; } + /// WITH TIES was used without ORDER BY + if (!order_expression_list && select_query->limit_with_ties) + throw Exception("Can not use WITH TIES without ORDER BY", ErrorCodes::WITH_TIES_WITHOUT_ORDER_BY); + /// SETTINGS key1 = value1, key2 = value2, ... 
if (s_settings.ignore(pos, expected)) { diff --git a/dbms/src/Processors/LimitTransform.cpp b/dbms/src/Processors/LimitTransform.cpp index 1be10c405bb..266267d4e56 100644 --- a/dbms/src/Processors/LimitTransform.cpp +++ b/dbms/src/Processors/LimitTransform.cpp @@ -6,19 +6,26 @@ namespace DB LimitTransform::LimitTransform( const Block & header_, size_t limit_, size_t offset_, - bool always_read_till_end_) + bool always_read_till_end_, bool with_ties_, + const SortDescription & description_) : IProcessor({header_}, {header_}) , input(inputs.front()), output(outputs.front()) , limit(limit_), offset(offset_) , always_read_till_end(always_read_till_end_) + , with_ties(with_ties_), description(description_) { + for (const auto & desc : description) + { + if (!desc.column_name.empty()) + sort_column_positions.push_back(header_.getPositionByName(desc.column_name)); + else + sort_column_positions.push_back(desc.column_number); + } } LimitTransform::Status LimitTransform::prepare() { - - /// Check can output. bool output_finished = false; if (output.isFinished()) @@ -46,7 +53,7 @@ LimitTransform::Status LimitTransform::prepare() } /// Check if we are done with pushing. - bool pushing_is_finished = rows_read >= offset + limit; + bool pushing_is_finished = (rows_read >= offset + limit) && ties_row_ref.empty(); if (pushing_is_finished) { if (!always_read_till_end) @@ -116,6 +123,13 @@ LimitTransform::Status LimitTransform::prepare() if (output.hasData()) return Status::PortFull; + if (with_ties && rows_read == offset + limit) + { + SharedChunkPtr shared_chunk = new detail::SharedChunk(current_chunk.clone()); + shared_chunk->sort_columns = extractSortColumns(shared_chunk->getColumns()); + ties_row_ref.set(shared_chunk, &shared_chunk->sort_columns, shared_chunk->getNumRows() - 1); + } + output.push(std::move(current_chunk)); has_block = false; @@ -132,8 +146,39 @@ LimitTransform::Status LimitTransform::prepare() void LimitTransform::work() { - size_t num_rows = current_chunk.getNumRows(); - size_t num_columns = current_chunk.getNumColumns(); + SharedChunkPtr shared_chunk = new detail::SharedChunk(std::move(current_chunk)); + shared_chunk->sort_columns = extractSortColumns(shared_chunk->getColumns()); + + size_t num_rows = shared_chunk->getNumRows(); + size_t num_columns = shared_chunk->getNumColumns(); + + if (!ties_row_ref.empty() && rows_read >= offset + limit) + { + UInt64 len; + for (len = 0; len < num_rows; ++len) + { + SharedChunkRowRef current_row; + current_row.set(shared_chunk, &shared_chunk->sort_columns, len); + + if (current_row != ties_row_ref) + { + ties_row_ref.reset(); + break; + } + } + + auto columns = shared_chunk->detachColumns(); + + if (len < num_rows) + { + for (size_t i = 0; i < num_columns; ++i) + columns[i] = columns[i]->cut(0, len); + } + + current_chunk.setColumns(std::move(columns), len); + block_processed = true; + return; + } /// return a piece of the block size_t start = std::max( @@ -145,7 +190,33 @@ void LimitTransform::work() static_cast(rows_read) - static_cast(offset), static_cast(limit) + static_cast(offset) - static_cast(rows_read) + static_cast(num_rows))); - auto columns = current_chunk.detachColumns(); + /// check if other rows in current block equals to last one in limit + if (with_ties) + { + ties_row_ref.set(shared_chunk, &shared_chunk->sort_columns, start + length - 1); + SharedChunkRowRef current_row; + + for (size_t i = ties_row_ref.row_num + 1; i < num_rows; ++i) + { + current_row.set(shared_chunk, &shared_chunk->sort_columns, i); + if (current_row == 
ties_row_ref) + ++length; + else + { + ties_row_ref.reset(); + break; + } + } + } + + if (length == num_rows) + { + current_chunk = std::move(*shared_chunk); + block_processed = true; + return; + } + + auto columns = shared_chunk->detachColumns(); for (size_t i = 0; i < num_columns; ++i) columns[i] = columns[i]->cut(start, length); @@ -155,5 +226,15 @@ void LimitTransform::work() block_processed = true; } +ColumnRawPtrs LimitTransform::extractSortColumns(const Columns & columns) +{ + ColumnRawPtrs res; + res.reserve(description.size()); + for (size_t pos : sort_column_positions) + res.push_back(columns[pos].get()); + + return res; +} + } diff --git a/dbms/src/Processors/LimitTransform.h b/dbms/src/Processors/LimitTransform.h index f80ca263c95..3df5e3cc049 100644 --- a/dbms/src/Processors/LimitTransform.h +++ b/dbms/src/Processors/LimitTransform.h @@ -1,7 +1,8 @@ #pragma once #include - +#include +#include namespace DB { @@ -23,10 +24,18 @@ private: UInt64 rows_before_limit_at_least = 0; + bool with_ties; + const SortDescription description; + SharedChunkRowRef ties_row_ref; + + std::vector sort_column_positions; + ColumnRawPtrs extractSortColumns(const Columns & columns); + public: LimitTransform( const Block & header_, size_t limit_, size_t offset_, - bool always_read_till_end_ = false); + bool always_read_till_end_ = false, bool with_ties_ = false, + const SortDescription & description_ = {}); String getName() const override { return "Limit"; } diff --git a/dbms/src/Processors/SharedChunk.h b/dbms/src/Processors/SharedChunk.h new file mode 100644 index 00000000000..c6fe3c12f89 --- /dev/null +++ b/dbms/src/Processors/SharedChunk.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +/// Allows you refer to the row in the block and hold the block ownership, +/// and thus avoid creating a temporary row object. +/// Do not use std::shared_ptr, since there is no need for a place for `weak_count` and `deleter`; +/// does not use Poco::SharedPtr, since you need to allocate a block and `refcount` in one piece; +/// does not use Poco::AutoPtr, since it does not have a `move` constructor and there are extra checks for nullptr; +/// The reference counter is not atomic, since it is used from one thread. +namespace detail +{ +struct SharedChunk : Chunk +{ + int refcount = 0; + + ColumnRawPtrs all_columns; + ColumnRawPtrs sort_columns; + + SharedChunk(Chunk && chunk) : Chunk(std::move(chunk)) {} +}; + +} + +inline void intrusive_ptr_add_ref(detail::SharedChunk * ptr) +{ + ++ptr->refcount; +} + +inline void intrusive_ptr_release(detail::SharedChunk * ptr) +{ + if (0 == --ptr->refcount) + delete ptr; +} + +using SharedChunkPtr = boost::intrusive_ptr; + + +struct SharedChunkRowRef +{ + ColumnRawPtrs * columns = nullptr; + size_t row_num; + SharedChunkPtr shared_block; + + void swap(SharedChunkRowRef & other) + { + std::swap(columns, other.columns); + std::swap(row_num, other.row_num); + std::swap(shared_block, other.shared_block); + } + + /// The number and types of columns must match. 
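+ /// Two refs compare equal when every referenced column compares equal at the two row positions; + /// LimitTransform relies on this to detect ties on the sort columns for LIMIT ... WITH TIES.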
+ bool operator==(const SharedChunkRowRef & other) const + { + size_t size = columns->size(); + for (size_t i = 0; i < size; ++i) + if (0 != (*columns)[i]->compareAt(row_num, other.row_num, *(*other.columns)[i], 1)) + return false; + return true; + } + + bool operator!=(const SharedChunkRowRef & other) const + { + return !(*this == other); + } + + void reset() + { + SharedChunkRowRef empty; + swap(empty); + } + + bool empty() const { return columns == nullptr; } + size_t size() const { return empty() ? 0 : columns->size(); } + + void set(SharedChunkPtr & shared_block_, ColumnRawPtrs * columns_, size_t row_num_) + { + shared_block = shared_block_; + columns = columns_; + row_num = row_num_; + } +}; + +} diff --git a/dbms/src/Processors/Transforms/FillingTransform.cpp b/dbms/src/Processors/Transforms/FillingTransform.cpp new file mode 100644 index 00000000000..50fac121819 --- /dev/null +++ b/dbms/src/Processors/Transforms/FillingTransform.cpp @@ -0,0 +1,201 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_WITH_FILL_EXPRESSION; +} + + +FillingTransform::FillingTransform( + const Block & header_, const SortDescription & sort_description_) + : ISimpleTransform(header_, header_, true) + , sort_description(sort_description_) + , filling_row(sort_description_) + , next_row(sort_description_) +{ + std::vector is_fill_column(header_.columns()); + for (const auto & elem : sort_description) + is_fill_column[header_.getPositionByName(elem.column_name)] = true; + + auto try_convert_fields = [](FillColumnDescription & descr, const DataTypePtr & type) + { + auto max_type = Field::Types::Null; + WhichDataType which(type); + DataTypePtr to_type; + if (isInteger(type) || which.isDateOrDateTime()) + { + max_type = Field::Types::Int64; + to_type = std::make_shared(); + } + else if (which.isFloat()) + { + max_type = Field::Types::Float64; + to_type = std::make_shared(); + } + + if (descr.fill_from.getType() > max_type || descr.fill_to.getType() > max_type + || descr.fill_step.getType() > max_type) + return false; + + descr.fill_from = convertFieldToType(descr.fill_from, *to_type); + descr.fill_to = convertFieldToType(descr.fill_to, *to_type); + descr.fill_step = convertFieldToType(descr.fill_step, *to_type); + + return true; + }; + + for (size_t i = 0; i < header_.columns(); ++i) + { + if (is_fill_column[i]) + { + size_t pos = fill_column_positions.size(); + auto & descr = filling_row.getFillDescription(pos); + auto type = header_.getByPosition(i).type; + if (!try_convert_fields(descr, type)) + throw Exception("Incompatible types of WITH FILL expression values with column type " + + type->getName(), ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + + if (type->isValueRepresentedByUnsignedInteger() && + ((!descr.fill_from.isNull() && less(descr.fill_from, Field{0}, 1)) || + (!descr.fill_to.isNull() && less(descr.fill_to, Field{0}, 1)))) + { + throw Exception("WITH FILL bound values cannot be negative for unsigned type " + + type->getName(), ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + } + + fill_column_positions.push_back(i); + } + else + other_column_positions.push_back(i); + } +} + +IProcessor::Status FillingTransform::prepare() +{ + if (input.isFinished() && !output.isFinished() && !has_input && !generate_suffix) + { + should_insert_first = next_row < filling_row; + + for (size_t i = 0; i < filling_row.size(); ++i) + next_row[i] = filling_row.getFillDescription(i).fill_to; + + if (filling_row < next_row) + { + generate_suffix = true; + return Status::Ready; 
+ } + } + + return ISimpleTransform::prepare(); +} + + +void FillingTransform::transform(Chunk & chunk) +{ + Columns old_fill_columns; + Columns old_other_columns; + MutableColumns res_fill_columns; + MutableColumns res_other_columns; + + auto init_columns_by_positions = [](const Columns & old_columns, Columns & new_columns, + MutableColumns & new_mutable_columns, const Positions & positions) + { + for (size_t pos : positions) + { + new_columns.push_back(old_columns[pos]); + new_mutable_columns.push_back(old_columns[pos]->cloneEmpty()->assumeMutable()); + } + }; + + if (generate_suffix) + { + const auto & empty_columns = inputs.front().getHeader().getColumns(); + init_columns_by_positions(empty_columns, old_fill_columns, res_fill_columns, fill_column_positions); + init_columns_by_positions(empty_columns, old_other_columns, res_other_columns, other_column_positions); + + if (should_insert_first && filling_row < next_row) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + while (filling_row.next(next_row)) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + setResultColumns(chunk, res_fill_columns, res_other_columns); + return; + } + + size_t num_rows = chunk.getNumRows(); + auto old_columns = chunk.detachColumns(); + + init_columns_by_positions(old_columns, old_fill_columns, res_fill_columns, fill_column_positions); + init_columns_by_positions(old_columns, old_other_columns, res_other_columns, other_column_positions); + + if (first) + { + for (size_t i = 0; i < filling_row.size(); ++i) + { + auto current_value = (*old_fill_columns[i])[0]; + const auto & fill_from = filling_row.getFillDescription(i).fill_from; + + if (!fill_from.isNull() && !equals(current_value, fill_from)) + { + filling_row.initFromDefaults(i); + if (less(fill_from, current_value, filling_row.getDirection(i))) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + break; + } + filling_row[i] = current_value; + } + first = false; + } + + for (size_t row_ind = 0; row_ind < num_rows; ++row_ind) + { + should_insert_first = next_row < filling_row; + + for (size_t i = 0; i < filling_row.size(); ++i) + { + auto current_value = (*old_fill_columns[i])[row_ind]; + const auto & fill_to = filling_row.getFillDescription(i).fill_to; + + if (fill_to.isNull() || less(current_value, fill_to, filling_row.getDirection(i))) + next_row[i] = current_value; + else + next_row[i] = fill_to; + } + + /// A case, when at previous step row was initialized from defaults 'fill_from' values + /// and probably we need to insert it to block. + if (should_insert_first && filling_row < next_row) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + /// Insert generated filling row to block, while it is less than current row in block. + while (filling_row.next(next_row)) + insertFromFillingRow(res_fill_columns, res_other_columns, filling_row); + + copyRowFromColumns(res_fill_columns, old_fill_columns, row_ind); + copyRowFromColumns(res_other_columns, old_other_columns, row_ind); + } + + setResultColumns(chunk, res_fill_columns, res_other_columns); +} + +void FillingTransform::setResultColumns(Chunk & chunk, MutableColumns & fill_columns, MutableColumns & other_columns) const +{ + MutableColumns result_columns(fill_columns.size() + other_columns.size()); + /// fill_columns always non-empty. 
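+ /// Reassemble the result in the original header order: fill columns go back to their header positions, the remaining columns to theirs.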
+ size_t num_rows = fill_columns[0]->size(); + + for (size_t i = 0; i < fill_columns.size(); ++i) + result_columns[fill_column_positions[i]] = std::move(fill_columns[i]); + for (size_t i = 0; i < other_columns.size(); ++i) + result_columns[other_column_positions[i]] = std::move(other_columns[i]); + + chunk.setColumns(std::move(result_columns), num_rows); +} + +} diff --git a/dbms/src/Processors/Transforms/FillingTransform.h b/dbms/src/Processors/Transforms/FillingTransform.h new file mode 100644 index 00000000000..5c4c78701f5 --- /dev/null +++ b/dbms/src/Processors/Transforms/FillingTransform.h @@ -0,0 +1,42 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +/** Implements the WITH FILL modifier of the ORDER BY clause. + * It fills gaps in the data stream with rows that contain the missing values for columns marked WITH FILL and default values for the other columns. + * Optionally FROM, TO and STEP values can be specified. + */ +class FillingTransform : public ISimpleTransform +{ +public: + FillingTransform(const Block & header_, const SortDescription & fill_description_); + + String getName() const override { return "FillingTransform"; } + + Status prepare() override; + +protected: + void transform(Chunk & Chunk) override; + +private: + void setResultColumns(Chunk & chunk, MutableColumns & fill_columns, MutableColumns & other_columns) const; + + const SortDescription sort_description; /// Contains only columns with WITH FILL. + FillingRow filling_row; /// Current row, which is used to fill gaps. + FillingRow next_row; /// Row up to which we need to generate filling rows. + + using Positions = std::vector; + Positions fill_column_positions; + Positions other_column_positions; + bool first = true; + bool generate_suffix = false; + + /// Determines whether the filling row should be inserted before generating the next rows. + bool should_insert_first = false; +}; + +} diff --git a/dbms/src/Processors/Transforms/MergingSortedTransform.h b/dbms/src/Processors/Transforms/MergingSortedTransform.h index f1175c8d347..0991835bfaf 100644 --- a/dbms/src/Processors/Transforms/MergingSortedTransform.h +++ b/dbms/src/Processors/Transforms/MergingSortedTransform.h @@ -2,46 +2,13 @@ #include #include #include +#include #include namespace DB { -/// Allows you refer to the row in the block and hold the block ownership, -/// and thus avoid creating a temporary row object. -/// Do not use std::shared_ptr, since there is no need for a place for `weak_count` and `deleter`; -/// does not use Poco::SharedPtr, since you need to allocate a block and `refcount` in one piece; -/// does not use Poco::AutoPtr, since it does not have a `move` constructor and there are extra checks for nullptr; -/// The reference counter is not atomic, since it is used from one thread.
-namespace detail -{ -struct SharedChunk : Chunk -{ - int refcount = 0; - - ColumnRawPtrs all_columns; - ColumnRawPtrs sort_columns; - - SharedChunk(Chunk && chunk) : Chunk(std::move(chunk)) {} -}; - -} - -using SharedChunkPtr = boost::intrusive_ptr; - - -inline void intrusive_ptr_add_ref(detail::SharedChunk * ptr) -{ - ++ptr->refcount; -} - -inline void intrusive_ptr_release(detail::SharedChunk * ptr) -{ - if (0 == --ptr->refcount) - delete ptr; -} - class MergingSortedTransform : public IProcessor { public: diff --git a/dbms/src/Storages/StorageFile.cpp b/dbms/src/Storages/StorageFile.cpp index cfd14c58a2d..1427306bc02 100644 --- a/dbms/src/Storages/StorageFile.cpp +++ b/dbms/src/Storages/StorageFile.cpp @@ -18,12 +18,19 @@ #include #include +#include #include #include #include +#include +#include +#include + +namespace fs = std::filesystem; + namespace DB { @@ -39,6 +46,54 @@ namespace ErrorCodes extern const int EMPTY_LIST_OF_COLUMNS_PASSED; } +namespace +{ +/* Recursive directory listing with matched paths as a result. + * Have the same method in StorageHDFS. + */ +std::vector LSWithRegexpMatching(const std::string & path_for_ls, const std::string & for_match) +{ + const size_t first_glob = for_match.find_first_of("*?{"); + + const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); + const std::string suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' + + const size_t next_slash = suffix_with_globs.find('/', 1); + re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); + + std::vector result; + const std::string prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); + if (!fs::exists(fs::path(prefix_without_globs.data()))) + { + return result; + } + const fs::directory_iterator end; + for (fs::directory_iterator it(prefix_without_globs); it != end; ++it) + { + const std::string full_path = it->path().string(); + const size_t last_slash = full_path.rfind('/'); + const String file_name = full_path.substr(last_slash); + const bool looking_for_directory = next_slash != std::string::npos; + /// Condition is_directory means what kind of path is it in current iteration of ls + if (!fs::is_directory(it->path()) && !looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + result.push_back(it->path().string()); + } + } + else if (fs::is_directory(it->path()) && looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + Strings result_part = LSWithRegexpMatching(full_path + "/", suffix_with_globs.substr(next_slash)); + std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); + } + } + } + return result; +} +} static std::string getTablePath(const std::string & db_dir_path, const std::string & table_name, const std::string & format_name) { @@ -90,8 +145,10 @@ StorageFile::StorageFile( if (poco_path.isRelative()) poco_path = Poco::Path(db_dir_path, poco_path); - path = poco_path.absolute().toString(); - checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); + const std::string path = poco_path.absolute().toString(); + paths = LSWithRegexpMatching("/", path); + for (const auto & cur_path : paths) + checkCreationIsAllowed(context_global, db_dir_path, cur_path, table_fd); is_db_table = false; } else /// Is DB's file @@ -99,14 +156,18 @@ StorageFile::StorageFile( if (db_dir_path.empty()) throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME); - path = 
getTablePath(db_dir_path, table_name, format_name); + paths = {getTablePath(db_dir_path, table_name, format_name)}; is_db_table = true; - Poco::File(Poco::Path(path).parent()).createDirectories(); + Poco::File(Poco::Path(paths.back()).parent()).createDirectories(); } } else /// Will use FD { - checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); + + if (paths.size() != 1) + throw Exception("Table '" + table_name + "' is in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED); + + checkCreationIsAllowed(context_global, db_dir_path, paths[0], table_fd); is_db_table = false; use_table_fd = true; @@ -121,7 +182,7 @@ StorageFile::StorageFile( class StorageFileBlockInputStream : public IBlockInputStream { public: - StorageFileBlockInputStream(StorageFile & storage_, const Context & context, UInt64 max_block_size) + StorageFileBlockInputStream(StorageFile & storage_, const Context & context, UInt64 max_block_size, std::string file_path) : storage(storage_) { if (storage.use_table_fd) @@ -147,8 +208,7 @@ public: else { shared_lock = std::shared_lock(storage.rwlock); - - read_buf = std::make_unique(storage.path); + read_buf = std::make_unique(file_path); } reader = FormatFactory::instance().getInput(storage.format_name, *read_buf, storage.getSampleBlock(), context, max_block_size); @@ -195,12 +255,16 @@ BlockInputStreams StorageFile::read( size_t max_block_size, unsigned /*num_streams*/) { - BlockInputStreamPtr block_input = std::make_shared(*this, context, max_block_size); const ColumnsDescription & columns_ = getColumns(); auto column_defaults = columns_.getDefaults(); - if (column_defaults.empty()) - return {block_input}; - return {std::make_shared(block_input, column_defaults, context)}; + BlockInputStreams blocks_input; + blocks_input.reserve(paths.size()); + for (const auto & file_path : paths) + { + BlockInputStreamPtr cur_block = std::make_shared(*this, context, max_block_size, file_path); + blocks_input.push_back(column_defaults.empty() ? 
cur_block : std::make_shared(cur_block, column_defaults, context)); + } + return blocks_input; } @@ -210,6 +274,8 @@ public: explicit StorageFileBlockOutputStream(StorageFile & storage_) : storage(storage_), lock(storage.rwlock) { + if (storage.paths.size() != 1) + throw Exception("Table '" + storage.table_name + "' is in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED); if (storage.use_table_fd) { /** NOTE: Using real file binded to FD may be misleading: @@ -221,7 +287,7 @@ public: } else { - write_buf = std::make_unique(storage.path, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); + write_buf = std::make_unique(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); } writer = FormatFactory::instance().getOutput(storage.format_name, *write_buf, storage.getSampleBlock(), storage.context_global); @@ -263,19 +329,28 @@ BlockOutputStreamPtr StorageFile::write( return std::make_shared(*this); } +String StorageFile::getDataPath() const +{ + if (paths.empty()) + throw Exception("Table '" + table_name + "' is in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED); + return paths[0]; +} void StorageFile::rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) { if (!is_db_table) throw Exception("Can't rename table '" + table_name + "' binded to user-defined file (or FD)", ErrorCodes::DATABASE_ACCESS_DENIED); + if (paths.size() != 1) + throw Exception("Can't rename table '" + table_name + "' in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED); + std::unique_lock lock(rwlock); std::string path_new = getTablePath(new_path_to_db, new_table_name, format_name); Poco::File(Poco::Path(path_new).parent()).createDirectories(); - Poco::File(path).renameTo(path_new); + Poco::File(paths[0]).renameTo(path_new); - path = std::move(path_new); + paths[0] = std::move(path_new); table_name = new_table_name; database_name = new_database_name; } diff --git a/dbms/src/Storages/StorageFile.h b/dbms/src/Storages/StorageFile.h index 1410cc5f215..aaf659edc62 100644 --- a/dbms/src/Storages/StorageFile.h +++ b/dbms/src/Storages/StorageFile.h @@ -40,7 +40,7 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) override; - String getDataPath() const override { return path; } + String getDataPath() const override; protected: friend class StorageFileBlockInputStream; @@ -68,9 +68,10 @@ private: std::string format_name; Context & context_global; - std::string path; int table_fd = -1; + std::vector paths{""}; + bool is_db_table = true; /// Table is stored in real database, not user's file bool use_table_fd = false; /// Use table_fd insted of path std::atomic table_fd_was_used{false}; /// To detect repeating reads from stdin diff --git a/dbms/src/Storages/StorageHDFS.cpp b/dbms/src/Storages/StorageHDFS.cpp index cb25580248f..2a029463706 100644 --- a/dbms/src/Storages/StorageHDFS.cpp +++ b/dbms/src/Storages/StorageHDFS.cpp @@ -9,12 +9,17 @@ #include #include #include +#include #include #include #include #include #include - +#include +#include +#include +#include +#include namespace DB { @@ -129,6 +134,51 @@ private: BlockOutputStreamPtr writer; }; +/* Recursive directory listing with matched paths as a result. + * Have the same method in StorageFile. 
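+ * The path is matched one '/'-separated component at a time: globs in the current component ('*', '?', '{...}') are converted to a regexp by makeRegexpPatternFromGlobs and matched against the directory listing; matching subdirectories are descended into recursively.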
+ */ +Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) +{ + const size_t first_glob = for_match.find_first_of("*?{"); + + const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); + const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' + const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' + + const size_t next_slash = suffix_with_globs.find('/', 1); + re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); + + HDFSFileInfo ls; + ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); + Strings result; + for (int i = 0; i < ls.length; ++i) + { + const String full_path = String(ls.file_info[i].mName); + const size_t last_slash = full_path.rfind('/'); + const String file_name = full_path.substr(last_slash); + const bool looking_for_directory = next_slash != std::string::npos; + const bool is_directory = ls.file_info[i].mKind == 'D'; + /// Condition with type of current file_info means what kind of path is it in current iteration of ls + if (!is_directory && !looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + result.push_back(String(ls.file_info[i].mName)); + } + } + else if (is_directory && looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + Strings result_part = LSWithRegexpMatching(full_path + "/", fs, suffix_with_globs.substr(next_slash)); + std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); + } + } + } + + return result; +} + } @@ -140,12 +190,22 @@ BlockInputStreams StorageHDFS::read( size_t max_block_size, unsigned /*num_streams*/) { - return {std::make_shared( - uri, - format_name, - getSampleBlock(), - context_, - max_block_size)}; + const size_t begin_of_path = uri.find('/', uri.find("//") + 2); + const String path_from_uri = uri.substr(begin_of_path); + const String uri_without_path = uri.substr(0, begin_of_path); + + HDFSBuilderPtr builder = createHDFSBuilder(uri_without_path + "/"); + HDFSFSPtr fs = createHDFSFS(builder.get()); + + const Strings res_paths = LSWithRegexpMatching("/", fs, path_from_uri); + BlockInputStreams result; + for (const auto & res_path : res_paths) + { + result.push_back(std::make_shared(uri_without_path + res_path, format_name, getSampleBlock(), context_, + max_block_size)); + } + + return result; } void StorageHDFS::rename(const String & /*new_path_to_db*/, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) diff --git a/dbms/tests/integration/test_globs_in_filepath/__init__.py b/dbms/tests/integration/test_globs_in_filepath/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/integration/test_globs_in_filepath/test.py b/dbms/tests/integration/test_globs_in_filepath/test.py new file mode 100644 index 00000000000..db794c35d47 --- /dev/null +++ b/dbms/tests/integration/test_globs_in_filepath/test.py @@ -0,0 +1,115 @@ +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node') +path_to_userfiles_from_defaut_config = "/var/lib/clickhouse/user_files/" # should be the same as in config file + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def test_strange_filenames(start_cluster): + # 2 rows 
data + some_data = "\t111.222\nData\t333.444" + + node.exec_in_container(['bash', '-c', 'mkdir {}strange_names/'.format(path_to_userfiles_from_defaut_config)], privileged=True, user='root') + + files = ["p.o.i.n.t.s", + "b}{ra{ces", + "b}.o{t.h"] + + # filename inside testing data for debug simplicity + for filename in files: + node.exec_in_container(['bash', '-c', 'echo "{}{}" > {}strange_names/{}'.format(filename, some_data, path_to_userfiles_from_defaut_config, filename)], privileged=True, user='root') + + test_requests = [("p.o.??n.t.s", "2"), + ("p.o.*t.s", "2"), + ("b}{r?{ces", "2"), + ("b}*ces", "2"), + ("b}.?{t.h", "2")] + + for pattern, value in test_requests: + assert node.query(''' + select count(*) from file('strange_names/{}', 'TSV', 'text String, number Float64') + '''.format(pattern)) == '{}\n'.format(value) + assert node.query(''' + select count(*) from file('{}strange_names/{}', 'TSV', 'text String, number Float64') + '''.format(path_to_userfiles_from_defaut_config, pattern)) == '{}\n'.format(value) + +def test_linear_structure(start_cluster): + # 2 rows data + some_data = "\t123.456\nData\t789.012" + + files = ["file1", "file2", "file3", "file4", "file5", + "file000", "file111", "file222", "file333", "file444", + "a_file", "b_file", "c_file", "d_file", "e_file", + "a_data", "b_data", "c_data", "d_data", "e_data"] + + # filename inside testing data for debug simplicity + for filename in files: + node.exec_in_container(['bash', '-c', 'echo "{}{}" > {}{}'.format(filename, some_data, path_to_userfiles_from_defaut_config, filename)], privileged=True, user='root') + + test_requests = [("file{0..9}", "10"), + ("file?", "10"), + ("file{0..9}{0..9}{0..9}", "10"), + ("file???", "10"), + ("file*", "20"), + ("a_{file,data}", "4"), + ("?_{file,data}", "20"), + ("{a,b,c,d,e}_{file,data}", "20"), + ("{a,b,c,d,e}?{file,data}", "20"), + ("*", "40")] + + for pattern, value in test_requests: + assert node.query(''' + select count(*) from file('{}', 'TSV', 'text String, number Float64') + '''.format(pattern)) == '{}\n'.format(value) + assert node.query(''' + select count(*) from file('{}{}', 'TSV', 'text String, number Float64') + '''.format(path_to_userfiles_from_defaut_config, pattern)) == '{}\n'.format(value) + +def test_deep_structure(start_cluster): + # 2 rows data + some_data = "\t135.791\nData\t246.802" + dirs = ["directory1/", "directory2/", "some_more_dir/", "we/", + "directory1/big_dir/", + "directory1/dir1/", "directory1/dir2/", "directory1/dir3/", + "directory2/dir1/", "directory2/dir2/", "directory2/one_more_dir/", + "some_more_dir/yet_another_dir/", + "we/need/", "we/need/to/", "we/need/to/go/", "we/need/to/go/deeper/"] + + for dir in dirs: + node.exec_in_container(['bash', '-c', 'mkdir {}{}'.format(path_to_userfiles_from_defaut_config, dir)], privileged=True, user='root') + + # all directories appeared in files must be listed in dirs + files = [] + for i in range(10): + for j in range(10): + for k in range(10): + files.append("directory1/big_dir/file"+str(i)+str(j)+str(k)) + + for dir in dirs: + files.append(dir+"file") + + # filename inside testing data for debug simplicity + for filename in files: + node.exec_in_container(['bash', '-c', 'echo "{}{}" > {}{}'.format(filename, some_data, path_to_userfiles_from_defaut_config, filename)], privileged=True, user='root') + + test_requests = [ ("directory{1..5}/big_dir/*", "2002"), ("directory{0..6}/big_dir/*{0..9}{0..9}{0..9}", "2000"), + ("?", "0"), + ("directory{0..5}/dir{1..3}/file", "10"), ("directory{0..5}/dir?/file", "10"), + 
("we/need/to/go/deeper/file", "2"), ("*/*/*/*/*/*", "2"), ("we/need/??/go/deeper/*?*?*?*?*", "2")] + + for pattern, value in test_requests: + assert node.query(''' + select count(*) from file('{}', 'TSV', 'text String, number Float64') + '''.format(pattern)) == '{}\n'.format(value) + assert node.query(''' + select count(*) from file('{}{}', 'TSV', 'text String, number Float64') + '''.format(path_to_userfiles_from_defaut_config, pattern)) == '{}\n'.format(value) \ No newline at end of file diff --git a/dbms/tests/integration/test_storage_hdfs/test.py b/dbms/tests/integration/test_storage_hdfs/test.py index 173c2d77b3f..55ef98f6fde 100644 --- a/dbms/tests/integration/test_storage_hdfs/test.py +++ b/dbms/tests/integration/test_storage_hdfs/test.py @@ -28,15 +28,31 @@ def started_cluster(): cluster.shutdown() def test_read_write_storage(started_cluster): - hdfs_api = HDFSApi("root") hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n") - assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" - node1.query("create table SimpleHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/simple_storage', 'TSV')") + node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)") + assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" assert node1.query("select * from SimpleHDFSStorage") == "1\tMark\t72.53\n" +def test_read_write_storage_with_globs(started_cluster): + hdfs_api = HDFSApi("root") + + for i in ["1", "2", "3"]: + hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n") + assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n" + + node1.query("create table HDFSStorageWithRange (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1..5}', 'TSV')") + node1.query("create table HDFSStorageWithEnum (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1,2,3,4,5}', 'TSV')") + node1.query("create table HDFSStorageWithQuestionMark (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage?', 'TSV')") + node1.query("create table HDFSStorageWithAsterisk (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage*', 'TSV')") + + assert node1.query("select count(*) from HDFSStorageWithRange") == '3\n' + assert node1.query("select count(*) from HDFSStorageWithEnum") == '3\n' + assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == '3\n' + assert node1.query("select count(*) from HDFSStorageWithAsterisk") == '3\n' + def test_read_write_table(started_cluster): hdfs_api = HDFSApi("root") data = "1\tSerialize\t555.222\n2\tData\t777.333\n" @@ -74,3 +90,27 @@ def test_bad_hdfs_uri(started_cluster): except Exception as ex: print ex assert 'Unable to open HDFS file' in str(ex) + +def test_globs_in_read_table(started_cluster): + hdfs_api = HDFSApi("root") + some_data = "1\tSerialize\t555.222\n2\tData\t777.333\n" + globs_dir = "/dir_for_test_with_globs/" + files = ["dir1/dir_dir/file1", "dir2/file2", "simple_table_function", "dir/file", "some_dir/dir1/file", "some_dir/dir2/file", "some_dir/file", "table1_function", "table2_function", "table3_function"] + for filename in files: + hdfs_api.write_data(globs_dir + filename, some_data) + + test_requests = [("dir{1..5}/dir_dir/file1", 1), + ("*_table_functio?", 1), + ("dir/fil?", 1), + ("table{3..8}_function", 1), + ("table{2..8}_function", 2), + ("dir/*", 1), + ("dir/*?*?*?*?*", 1), + ("dir/*?*?*?*?*?*", 0), + ("some_dir/*/file", 2), + ("some_dir/dir?/*", 2), + ("*/*/*", 
3), + ("?", 0)] + + for pattern, value in test_requests: + assert node1.query("select * from hdfs('hdfs://hdfs1:9000" + globs_dir + pattern + "', 'TSV', 'id UInt64, text String, number Float64')") == value * some_data \ No newline at end of file diff --git a/dbms/tests/queries/0_stateless/00975_sample_prewhere.reference b/dbms/tests/queries/0_stateless/00975_sample_prewhere.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/00975_sample_prewhere.sql b/dbms/tests/queries/0_stateless/00975_sample_prewhere.sql new file mode 100644 index 00000000000..a64eb5d0edf --- /dev/null +++ b/dbms/tests/queries/0_stateless/00975_sample_prewhere.sql @@ -0,0 +1,7 @@ +create table if not exists sample_prewhere (date Date, id Int32, time Int64) engine = MergeTree partition by date order by (id, time, intHash64(time)) sample by intHash64(time); + +insert into sample_prewhere values ('2019-01-01', 2, toDateTime('2019-07-20 00:00:01')); +insert into sample_prewhere values ('2019-01-01', 1, toDateTime('2019-07-20 00:00:02')); +insert into sample_prewhere values ('2019-01-02', 3, toDateTime('2019-07-20 00:00:03')); + +select id from remote('127.0.0.{1,3}', currentDatabase(), sample_prewhere) SAMPLE 1 where toDateTime(time) = '2019-07-20 00:00:00'; diff --git a/dbms/tests/queries/0_stateless/00995_order_by_with_fill.reference b/dbms/tests/queries/0_stateless/00995_order_by_with_fill.reference new file mode 100644 index 00000000000..adb0e1aa2c3 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00995_order_by_with_fill.reference @@ -0,0 +1,510 @@ +*** table without fill to compare *** +2019-05-07 18 prh +2019-05-07 26 2ke +2019-05-08 28 otf +2019-05-09 25 798 +2019-05-10 1 myj +2019-05-10 16 vp7 +2019-05-11 18 3s2 +2019-05-15 27 enb +2019-05-19 20 yfh +2019-05-23 15 01v +2019-05-23 29 72y +2019-05-24 13 sd0 +2019-05-25 17 0ei +2019-05-30 18 3kd +2019-06-04 5 6az +*** date WITH FILL, val *** +2019-05-07 18 prh +2019-05-07 26 2ke +2019-05-08 28 otf +2019-05-09 25 798 +2019-05-10 1 myj +2019-05-10 16 vp7 +2019-05-11 18 3s2 +2019-05-12 0 +2019-05-13 0 +2019-05-14 0 +2019-05-15 27 enb +2019-05-16 0 +2019-05-17 0 +2019-05-18 0 +2019-05-19 20 yfh +2019-05-20 0 +2019-05-21 0 +2019-05-22 0 +2019-05-23 15 01v +2019-05-23 29 72y +2019-05-24 13 sd0 +2019-05-25 17 0ei +2019-05-26 0 +2019-05-27 0 +2019-05-28 0 +2019-05-29 0 +2019-05-30 18 3kd +2019-05-31 0 +2019-06-01 0 +2019-06-02 0 +2019-06-03 0 +2019-06-04 5 6az +*** date WITH FILL FROM 2019-05-01 TO 2019-05-31, val WITH FILL *** +2019-05-01 0 +2019-05-02 0 +2019-05-03 0 +2019-05-04 0 +2019-05-05 0 +2019-05-06 0 +2019-05-07 18 prh +2019-05-07 19 +2019-05-07 20 +2019-05-07 21 +2019-05-07 22 +2019-05-07 23 +2019-05-07 24 +2019-05-07 25 +2019-05-07 26 2ke +2019-05-08 28 otf +2019-05-09 25 798 +2019-05-10 1 myj +2019-05-10 2 +2019-05-10 3 +2019-05-10 4 +2019-05-10 5 +2019-05-10 6 +2019-05-10 7 +2019-05-10 8 +2019-05-10 9 +2019-05-10 10 +2019-05-10 11 +2019-05-10 12 +2019-05-10 13 +2019-05-10 14 +2019-05-10 15 +2019-05-10 16 vp7 +2019-05-11 18 3s2 +2019-05-12 0 +2019-05-13 0 +2019-05-14 0 +2019-05-15 27 enb +2019-05-16 0 +2019-05-17 0 +2019-05-18 0 +2019-05-19 20 yfh +2019-05-20 0 +2019-05-21 0 +2019-05-22 0 +2019-05-23 15 01v +2019-05-23 16 +2019-05-23 17 +2019-05-23 18 +2019-05-23 19 +2019-05-23 20 +2019-05-23 21 +2019-05-23 22 +2019-05-23 23 +2019-05-23 24 +2019-05-23 25 +2019-05-23 26 +2019-05-23 27 +2019-05-23 28 +2019-05-23 29 72y +2019-05-24 13 sd0 +2019-05-25 17 0ei +2019-05-26 0 +2019-05-27 0 +2019-05-28 0 +2019-05-29 
0 +2019-05-30 18 3kd +2019-06-04 5 6az +*** date DESC WITH FILL, val WITH FILL FROM 1 TO 6 *** +2019-06-04 1 +2019-06-04 2 +2019-06-04 3 +2019-06-04 4 +2019-06-04 5 6az +2019-06-03 1 +2019-06-03 2 +2019-06-03 3 +2019-06-03 4 +2019-06-03 5 +2019-06-02 1 +2019-06-02 2 +2019-06-02 3 +2019-06-02 4 +2019-06-02 5 +2019-06-01 1 +2019-06-01 2 +2019-06-01 3 +2019-06-01 4 +2019-06-01 5 +2019-05-31 1 +2019-05-31 2 +2019-05-31 3 +2019-05-31 4 +2019-05-31 5 +2019-05-30 1 +2019-05-30 2 +2019-05-30 3 +2019-05-30 4 +2019-05-30 5 +2019-05-30 18 3kd +2019-05-29 1 +2019-05-29 2 +2019-05-29 3 +2019-05-29 4 +2019-05-29 5 +2019-05-28 1 +2019-05-28 2 +2019-05-28 3 +2019-05-28 4 +2019-05-28 5 +2019-05-27 1 +2019-05-27 2 +2019-05-27 3 +2019-05-27 4 +2019-05-27 5 +2019-05-26 1 +2019-05-26 2 +2019-05-26 3 +2019-05-26 4 +2019-05-26 5 +2019-05-25 1 +2019-05-25 2 +2019-05-25 3 +2019-05-25 4 +2019-05-25 5 +2019-05-25 17 0ei +2019-05-24 1 +2019-05-24 2 +2019-05-24 3 +2019-05-24 4 +2019-05-24 5 +2019-05-24 13 sd0 +2019-05-23 1 +2019-05-23 2 +2019-05-23 3 +2019-05-23 4 +2019-05-23 5 +2019-05-23 15 01v +2019-05-23 29 72y +2019-05-22 1 +2019-05-22 2 +2019-05-22 3 +2019-05-22 4 +2019-05-22 5 +2019-05-21 1 +2019-05-21 2 +2019-05-21 3 +2019-05-21 4 +2019-05-21 5 +2019-05-20 1 +2019-05-20 2 +2019-05-20 3 +2019-05-20 4 +2019-05-20 5 +2019-05-19 1 +2019-05-19 2 +2019-05-19 3 +2019-05-19 4 +2019-05-19 5 +2019-05-19 20 yfh +2019-05-18 1 +2019-05-18 2 +2019-05-18 3 +2019-05-18 4 +2019-05-18 5 +2019-05-17 1 +2019-05-17 2 +2019-05-17 3 +2019-05-17 4 +2019-05-17 5 +2019-05-16 1 +2019-05-16 2 +2019-05-16 3 +2019-05-16 4 +2019-05-16 5 +2019-05-15 1 +2019-05-15 2 +2019-05-15 3 +2019-05-15 4 +2019-05-15 5 +2019-05-15 27 enb +2019-05-14 1 +2019-05-14 2 +2019-05-14 3 +2019-05-14 4 +2019-05-14 5 +2019-05-13 1 +2019-05-13 2 +2019-05-13 3 +2019-05-13 4 +2019-05-13 5 +2019-05-12 1 +2019-05-12 2 +2019-05-12 3 +2019-05-12 4 +2019-05-12 5 +2019-05-11 1 +2019-05-11 2 +2019-05-11 3 +2019-05-11 4 +2019-05-11 5 +2019-05-11 18 3s2 +2019-05-10 1 myj +2019-05-10 2 +2019-05-10 3 +2019-05-10 4 +2019-05-10 5 +2019-05-10 16 vp7 +2019-05-09 1 +2019-05-09 2 +2019-05-09 3 +2019-05-09 4 +2019-05-09 5 +2019-05-09 25 798 +2019-05-08 1 +2019-05-08 2 +2019-05-08 3 +2019-05-08 4 +2019-05-08 5 +2019-05-08 28 otf +2019-05-07 1 +2019-05-07 2 +2019-05-07 3 +2019-05-07 4 +2019-05-07 5 +2019-05-07 18 prh +2019-05-07 26 2ke +*** date DESC WITH FILL TO 2019-05-01 STEP -2, val DESC WITH FILL FROM 10 TO -5 STEP -3 *** +2019-06-04 10 +2019-06-04 7 +2019-06-04 5 6az +2019-06-04 4 +2019-06-04 1 +2019-06-04 -2 +2019-06-02 10 +2019-06-02 7 +2019-06-02 4 +2019-06-02 1 +2019-06-02 -2 +2019-05-31 10 +2019-05-31 7 +2019-05-31 4 +2019-05-31 1 +2019-05-31 -2 +2019-05-30 18 3kd +2019-05-29 10 +2019-05-29 7 +2019-05-29 4 +2019-05-29 1 +2019-05-29 -2 +2019-05-27 10 +2019-05-27 7 +2019-05-27 4 +2019-05-27 1 +2019-05-27 -2 +2019-05-25 17 0ei +2019-05-25 10 +2019-05-25 7 +2019-05-25 4 +2019-05-25 1 +2019-05-25 -2 +2019-05-24 13 sd0 +2019-05-23 29 72y +2019-05-23 15 01v +2019-05-23 10 +2019-05-23 7 +2019-05-23 4 +2019-05-23 1 +2019-05-23 -2 +2019-05-21 10 +2019-05-21 7 +2019-05-21 4 +2019-05-21 1 +2019-05-21 -2 +2019-05-19 20 yfh +2019-05-19 10 +2019-05-19 7 +2019-05-19 4 +2019-05-19 1 +2019-05-19 -2 +2019-05-17 10 +2019-05-17 7 +2019-05-17 4 +2019-05-17 1 +2019-05-17 -2 +2019-05-15 27 enb +2019-05-15 10 +2019-05-15 7 +2019-05-15 4 +2019-05-15 1 +2019-05-15 -2 +2019-05-13 10 +2019-05-13 7 +2019-05-13 4 +2019-05-13 1 +2019-05-13 -2 +2019-05-11 18 3s2 +2019-05-11 10 +2019-05-11 7 +2019-05-11 4 
+2019-05-11 1 +2019-05-11 -2 +2019-05-10 16 vp7 +2019-05-10 1 myj +2019-05-09 25 798 +2019-05-09 10 +2019-05-09 7 +2019-05-09 4 +2019-05-09 1 +2019-05-09 -2 +2019-05-08 28 otf +2019-05-07 26 2ke +2019-05-07 18 prh +2019-05-07 10 +2019-05-07 7 +2019-05-07 4 +2019-05-07 1 +2019-05-07 -2 +2019-05-05 10 +2019-05-05 7 +2019-05-05 4 +2019-05-05 1 +2019-05-05 -2 +2019-05-03 10 +2019-05-03 7 +2019-05-03 4 +2019-05-03 1 +2019-05-03 -2 +2019-05-01 10 +2019-05-01 7 +2019-05-01 4 +2019-05-01 1 +2019-05-01 -2 +*** date WITH FILL TO 2019-06-23 STEP 3, val WITH FILL FROM -10 STEP 2 +2019-05-07 -10 +2019-05-07 -8 +2019-05-07 -6 +2019-05-07 -4 +2019-05-07 -2 +2019-05-07 0 +2019-05-07 2 +2019-05-07 4 +2019-05-07 6 +2019-05-07 8 +2019-05-07 10 +2019-05-07 12 +2019-05-07 14 +2019-05-07 16 +2019-05-07 18 prh +2019-05-07 20 +2019-05-07 22 +2019-05-07 24 +2019-05-07 26 2ke +2019-05-08 28 otf +2019-05-09 25 798 +2019-05-10 -10 +2019-05-10 -8 +2019-05-10 -6 +2019-05-10 -4 +2019-05-10 -2 +2019-05-10 0 +2019-05-10 1 myj +2019-05-10 2 +2019-05-10 4 +2019-05-10 6 +2019-05-10 8 +2019-05-10 10 +2019-05-10 12 +2019-05-10 14 +2019-05-10 16 vp7 +2019-05-11 18 3s2 +2019-05-13 -10 +2019-05-15 27 enb +2019-05-16 -10 +2019-05-19 -10 +2019-05-19 -8 +2019-05-19 -6 +2019-05-19 -4 +2019-05-19 -2 +2019-05-19 0 +2019-05-19 2 +2019-05-19 4 +2019-05-19 6 +2019-05-19 8 +2019-05-19 10 +2019-05-19 12 +2019-05-19 14 +2019-05-19 16 +2019-05-19 18 +2019-05-19 20 yfh +2019-05-22 -10 +2019-05-23 15 01v +2019-05-23 29 72y +2019-05-24 13 sd0 +2019-05-25 -10 +2019-05-25 -8 +2019-05-25 -6 +2019-05-25 -4 +2019-05-25 -2 +2019-05-25 0 +2019-05-25 2 +2019-05-25 4 +2019-05-25 6 +2019-05-25 8 +2019-05-25 10 +2019-05-25 12 +2019-05-25 14 +2019-05-25 16 +2019-05-25 17 0ei +2019-05-28 -10 +2019-05-30 18 3kd +2019-05-31 -10 +2019-06-03 -10 +2019-06-04 5 6az +2019-06-06 -10 +2019-06-09 -10 +2019-06-12 -10 +2019-06-15 -10 +2019-06-18 -10 +2019-06-21 -10 +*** table without fill to compare *** +1 -2 +1 3 +3 2 +5 -1 +6 5 +8 0 +*** a WITH FILL, b WITH fill *** +1 -2 +1 -1 +1 0 +1 1 +1 2 +1 3 +2 0 +3 2 +4 0 +5 -1 +6 5 +7 0 +8 0 +*** a WITH FILL, b WITH fill TO 6 STEP 2 *** +1 -2 +1 0 +1 2 +1 3 +1 4 +2 0 +3 2 +3 4 +4 0 +5 -1 +5 1 +5 3 +5 5 +6 5 +7 0 +8 0 +8 2 +8 4 diff --git a/dbms/tests/queries/0_stateless/00995_order_by_with_fill.sql b/dbms/tests/queries/0_stateless/00995_order_by_with_fill.sql new file mode 100644 index 00000000000..7f7f85bdb5b --- /dev/null +++ b/dbms/tests/queries/0_stateless/00995_order_by_with_fill.sql @@ -0,0 +1,45 @@ +DROP TABLE IF EXISTS fill; +CREATE TABLE fill (date Date, val Int, str String) ENGINE = Memory; +INSERT INTO fill VALUES (toDate('2019-05-24'), 13, 'sd0')(toDate('2019-05-10'), 16, 'vp7')(toDate('2019-05-25'), 17, '0ei')(toDate('2019-05-30'), 18, '3kd')(toDate('2019-05-15'), 27, 'enb')(toDate('2019-06-04'), 5, '6az')(toDate('2019-05-23'), 15, '01v')(toDate('2019-05-08'), 28, 'otf')(toDate('2019-05-19'), 20, 'yfh')(toDate('2019-05-07'), 26, '2ke')(toDate('2019-05-07'), 18, 'prh')(toDate('2019-05-09'), 25, '798')(toDate('2019-05-10'), 1, 'myj')(toDate('2019-05-11'), 18, '3s2')(toDate('2019-05-23'), 29, '72y'); + +SELECT '*** table without fill to compare ***'; +SELECT * FROM fill ORDER BY date, val; + +-- Some useful cases + +SELECT '*** date WITH FILL, val ***'; +SELECT * FROM fill ORDER BY date WITH FILL, val; + +SELECT '*** date WITH FILL FROM 2019-05-01 TO 2019-05-31, val WITH FILL ***'; +SELECT * FROM fill ORDER BY date WITH FILL FROM toDate('2019-05-01') TO toDate('2019-05-31'), val WITH FILL; + +SELECT '*** date DESC 
WITH FILL, val WITH FILL FROM 1 TO 6 ***'; +SELECT * FROM fill ORDER BY date DESC WITH FILL, val WITH FILL FROM 1 TO 6; + +-- Some weird cases + +SELECT '*** date DESC WITH FILL TO 2019-05-01 STEP -2, val DESC WITH FILL FROM 10 TO -5 STEP -3 ***'; +SELECT * FROM fill ORDER BY date DESC WITH FILL TO toDate('2019-05-01') STEP -2, val DESC WITH FILL FROM 10 TO -5 STEP -3; + +SELECT '*** date WITH FILL TO 2019-06-23 STEP 3, val WITH FILL FROM -10 STEP 2'; +SELECT * FROM fill ORDER BY date WITH FILL TO toDate('2019-06-23') STEP 3, val WITH FILL FROM -10 STEP 2; + +DROP TABLE fill; +CREATE TABLE fill (a UInt32, b Int32) ENGINE = Memory; +INSERT INTO fill VALUES (1, -2), (1, 3), (3, 2), (5, -1), (6, 5), (8, 0); + +SELECT '*** table without fill to compare ***'; +SELECT * FROM fill ORDER BY a, b; + +SELECT '*** a WITH FILL, b WITH fill ***'; +SELECT * FROM fill ORDER BY a WITH FILL, b WITH fill; + +SELECT '*** a WITH FILL, b WITH fill TO 6 STEP 2 ***'; +SELECT * FROM fill ORDER BY a WITH FILL, b WITH fill TO 6 STEP 2; + +SELECT * FROM fill ORDER BY a WITH FILL STEP -1; -- { serverError 475 } +SELECT * FROM fill ORDER BY a WITH FILL FROM 10 TO 1; -- { serverError 475 } +SELECT * FROM fill ORDER BY a DESC WITH FILL FROM 1 TO 10; -- { serverError 475 } +SELECT * FROM fill ORDER BY a WITH FILL FROM -10 to 10; -- { serverError 475 } + +DROP TABLE fill; diff --git a/dbms/tests/queries/0_stateless/00996_limit_with_ties.reference b/dbms/tests/queries/0_stateless/00996_limit_with_ties.reference new file mode 100644 index 00000000000..aa5d102bc9b --- /dev/null +++ b/dbms/tests/queries/0_stateless/00996_limit_with_ties.reference @@ -0,0 +1,52 @@ +1 +1 +* +1 +1 +2 +2 +2 +2 +* +1 +1 +2 +2 +2 +2 +* +1 +* +1 +2 +2 +2 +2 +* +1 +1 +* +2 +2 +2 +2 +* +1 +1 +2 +2 +2 +2 +* +1 +1 +* +2 +2 +2 +2 +* +2 +2 +2 +* diff --git a/dbms/tests/queries/0_stateless/00996_limit_with_ties.sql b/dbms/tests/queries/0_stateless/00996_limit_with_ties.sql new file mode 100644 index 00000000000..3e4813bc6b5 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00996_limit_with_ties.sql @@ -0,0 +1,35 @@ +DROP TABLE IF EXISTS ties; +CREATE TABLE ties (a Int) ENGINE = Memory; + +-- SET experimental_use_processors=1; + +INSERT INTO ties VALUES (1), (1), (2), (2), (2), (2) (3), (3); + +SELECT a FROM ties order by a limit 1 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 3 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 5 with ties; +SELECT '*'; + +SET max_block_size = 2; +SELECT a FROM ties order by a limit 1, 1 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 1, 2 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 2 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 2, 3 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 4 with ties; +SELECT '*'; + +SET max_block_size = 3; +SELECT a FROM ties order by a limit 1 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 2, 3 with ties; +SELECT '*'; +SELECT a FROM ties order by a limit 3, 2 with ties; +SELECT '*'; + +DROP TABLE ties; diff --git a/docker/test/stateful_with_coverage/run.sh b/docker/test/stateful_with_coverage/run.sh index a6d2ba0e9e8..d521632f98a 100755 --- a/docker/test/stateful_with_coverage/run.sh +++ b/docker/test/stateful_with_coverage/run.sh @@ -47,6 +47,7 @@ ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/con ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/text_log.xml 
/etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/; \ + ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \ ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/; \ ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/; \ diff --git a/docker/test/stateless_with_coverage/run.sh b/docker/test/stateless_with_coverage/run.sh index ccf3e53f715..26e230573d5 100755 --- a/docker/test/stateless_with_coverage/run.sh +++ b/docker/test/stateless_with_coverage/run.sh @@ -49,6 +49,7 @@ ln -s /usr/share/clickhouse-test/config/zookeeper.xml /etc/clickhouse-server/con ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/metric_log.xml /etc/clickhouse-server/config.d/; \ + ln -s /usr/share/clickhouse-test/config/query_masking_rules.xml /etc/clickhouse-server/config.d/; \ ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/users.d/; \ ln -s /usr/share/clickhouse-test/config/readonly.xml /etc/clickhouse-server/users.d/; \ ln -s /usr/share/clickhouse-test/config/ints_dictionary.xml /etc/clickhouse-server/; \ diff --git a/docs/en/development/build.md b/docs/en/development/build.md index 854562e191d..02cea936c70 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -40,7 +40,7 @@ sudo apt-get install git cmake ninja-build Or cmake3 instead of cmake on older systems. -## Install GCC 8 +## Install GCC 9 There are several ways to do this. @@ -50,18 +50,18 @@ There are several ways to do this. sudo apt-get install software-properties-common sudo apt-add-repository ppa:ubuntu-toolchain-r/test sudo apt-get update -sudo apt-get install gcc-8 g++-8 +sudo apt-get install gcc-9 g++-9 ``` ### Install from Sources Look at [utils/ci/build-gcc-from-sources.sh](https://github.com/yandex/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) -## Use GCC 8 for Builds +## Use GCC 9 for Builds ```bash -export CC=gcc-8 -export CXX=g++-8 +export CC=gcc-9 +export CXX=g++-9 ``` ## Install Required Libraries from Packages diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 251f42cf592..ed36e79fbc0 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -11,7 +11,7 @@ The supported formats are: | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Template](#template) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | @@ -121,7 +121,7 @@ During parsing, the first and second rows are completely ignored. This format is also available under the name `TSVWithNamesAndTypes`. -## Template {#template} +## Template {#format-template} This format allows to specify a custom format string with placeholders for values with specified escaping rule. 
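A minimal sketch of how the `Template` format referenced by the new `#format-template` anchor is used; the query and the format string are illustrative, built from the `format_schema_rows` syntax documented later in this diff:

```sql
-- Illustrative only: prints one custom-formatted line per row.
-- 'SearchPhrase' and 'c' are the column names used in the docs examples.
SELECT SearchPhrase, count() AS c
FROM test.hits
GROUP BY SearchPhrase
ORDER BY c DESC
LIMIT 5
FORMAT Template
SETTINGS format_schema_rows = 'Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}',
         format_schema_rows_between_delimiter = '\n'
```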
@@ -934,7 +934,7 @@ You can select data from a ClickHouse table and save them into some file in the clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` -To exchange data with the Hadoop, you can use `HDFS` table engine. +To exchange data with the Hadoop, you can use [`HDFS` table engine](../../operations/table_engines/hdfs.md). ## Format Schema {#formatschema} diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 9c711073177..6f6201feb24 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -74,7 +74,7 @@ If `force_primary_key=1`, ClickHouse checks to see if the query has a primary ke ## format_schema -This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) or [Template](https://clickhouse.yandex/docs/en/interfaces/formats/#template). The value depends on the format. +This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) or [Template](../../interfaces/formats.md#format-template). The value depends on the format. ## fsync_metadata diff --git a/docs/en/operations/table_engines/file.md b/docs/en/operations/table_engines/file.md index 9e3c5b3400b..bd7ee3cb90e 100644 --- a/docs/en/operations/table_engines/file.md +++ b/docs/en/operations/table_engines/file.md @@ -27,7 +27,7 @@ When creating table using `File(Format)` it creates empty subdirectory in that f You may manually create this subfolder and file in server filesystem and then [ATTACH](../../query_language/misc.md) it to table information with matching name, so you can query data from that file. !!! warning - Be careful with this funcionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. + Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. **Example:** @@ -73,9 +73,9 @@ $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64 - Multiple `SELECT` queries can be performed concurrently, but `INSERT` queries will wait each other. - Not supported: - - `ALTER` - - `SELECT ... SAMPLE` - - Indices - - Replication + - `ALTER` + - `SELECT ... SAMPLE` + - Indices + - Replication [Original article](https://clickhouse.yandex/docs/en/operations/table_engines/file/) diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md new file mode 100644 index 00000000000..652ca43b176 --- /dev/null +++ b/docs/en/operations/table_engines/hdfs.md @@ -0,0 +1,51 @@ +# HDFS {#table_engines-hdfs} + +This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.htmll)via ClickHouse. This engine is similar +to the [File](file.md) and [URL](url.md) engines, but provides Hadoop-specific features. + +## Usage + +``` +ENGINE = HDFS(URI, format) +``` +The `URI` parameter is the whole file URI in HDFS. +The `format` parameter specifies one of the available file formats. 
To perform +`SELECT` queries, the format must be supported for input, and to perform +`INSERT` queries -- for output. The available formats are listed in the +[Formats](../../interfaces/formats.md#formats) section. + +**Example:** + +**1.** Set up the `hdfs_engine_table` table: + +``` sql +CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') +``` + +**2.** Fill file: +``` sql +INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) +``` + +**3.** Query the data: + +``` sql +SELECT * FROM hdfs_engine_table LIMIT 2 +``` + +``` +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` + +## Implementation Details + +- Reads and writes can be parallel +- Not supported: + - `ALTER` and `SELECT...SAMPLE` operations. + - Indexes. + - Replication. + +[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/hdfs/) diff --git a/docs/en/operations/table_engines/index.md b/docs/en/operations/table_engines/index.md index ce8cf5b4678..8da7288b35a 100644 --- a/docs/en/operations/table_engines/index.md +++ b/docs/en/operations/table_engines/index.md @@ -45,6 +45,7 @@ Engines in the family: - [MySQL](mysql.md) - [ODBC](odbc.md) - [JDBC](jdbc.md) +- [HDFS](hdfs.md) ### Special engines diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index a3646d3d0e9..7c694a1612c 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -313,6 +313,48 @@ INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100 INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4 ``` +#### Functions Support + +Conditions in the `WHERE` clause contain calls of functions over the columns. If the column is a part of some index, ClickHouse tries to use this index when performing the functions. ClickHouse supports different subset of functions for using indexes. + +The `set` index can be used with all functions. Functions subsets for other indexes are in the table below. 
+ +Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter +----------------------------|-------------|--------|------------|------------|--------------- +[equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ +[notEquals(!=, <>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ +[like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ +[notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ +[startsWith](../../query_language/functions/string_functions.md#function-startswith) | ✔ | ✔ | ✔ | ✔ | ✗ +[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | +[multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✔ | ✗ +[in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ +[notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ +[less (<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ +[greater (>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ +[lessOrEquals (<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ +[greaterOrEquals (>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ +[empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ +[notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ +hasToken | ✗ | ✗ | ✗ | ✔ | ✗ + +Functions with a constant argument less than ngram size couldn't be used by `ngrambf_v1` for the query optimization. + +Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, `bloom_filter` indexes couldn't be used for optimizing queries where the result of a function is expected to be false, for example: + +- Can be optimized: + - `s LIKE '%test%'` + - `NOT s NOT LIKE '%test%'` + - `s = 1` + - `NOT s != 1` + - `startsWith(s, 'test')` +- Can't be optimized: + - `NOT s LIKE '%test%'` + - `s NOT LIKE '%test%'` + - `NOT s = 1` + - `s != 1` + - `NOT startsWith(s, 'test')` + ## Concurrent Data Access For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations. diff --git a/docs/en/query_language/functions/array_functions.md b/docs/en/query_language/functions/array_functions.md index 94d79a1898b..5065d428994 100644 --- a/docs/en/query_language/functions/array_functions.md +++ b/docs/en/query_language/functions/array_functions.md @@ -1,12 +1,12 @@ # Functions for working with arrays -## empty +## empty {#function-empty} Returns 1 for an empty array, or 0 for a non-empty array. The result type is UInt8. The function also works for strings. -## notEmpty +## notEmpty {#function-notempty} Returns 0 for an empty array, or 1 for a non-empty array. The result type is UInt8. @@ -73,7 +73,7 @@ Get the element with the index `n` from the array `arr`. `n` must be any integer Indexes in an array begin from one. Negative indexes are supported. 
In this case, it selects the corresponding element numbered from the end. For example, `arr[-1]` is the last item in the array. -If the index falls outside of the bounds of an array, it returns some default value (0 for numbers, an empty string for strings, etc.). +If the index falls outside of the bounds of an array, it returns some default value (0 for numbers, an empty string for strings, etc.), except for the case with a non-constant array and a constant index 0 (in this case there will be an error `Array indices are 1-based`). ## has(arr, elem) diff --git a/docs/en/query_language/functions/comparison_functions.md b/docs/en/query_language/functions/comparison_functions.md index 39987ef2893..337a213673d 100644 --- a/docs/en/query_language/functions/comparison_functions.md +++ b/docs/en/query_language/functions/comparison_functions.md @@ -17,17 +17,17 @@ Strings are compared by bytes. A shorter string is smaller than all strings that Note. Up until version 1.1.54134, signed and unsigned numbers were compared the same way as in C++. In other words, you could get an incorrect result in cases like SELECT 9223372036854775807 > -1. This behavior changed in version 1.1.54134 and is now mathematically correct. -## equals, a = b and a == b operator +## equals, a = b and a == b operator {#function-equals} -## notEquals, a ! operator= b and a `<>` b +## notEquals, a ! operator= b and a `<>` b {#function-notequals} -## less, `< operator` +## less, `< operator` {#function-less} -## greater, `> operator` +## greater, `> operator` {#function-greater} -## lessOrEquals, `<= operator` +## lessOrEquals, `<= operator` {#function-lessorequals} -## greaterOrEquals, `>= operator` +## greaterOrEquals, `>= operator` {#function-greaterorequals} [Original article](https://clickhouse.yandex/docs/en/query_language/functions/comparison_functions/) diff --git a/docs/en/query_language/functions/in_functions.md b/docs/en/query_language/functions/in_functions.md index e7b355bb76c..5886dcc3bc7 100644 --- a/docs/en/query_language/functions/in_functions.md +++ b/docs/en/query_language/functions/in_functions.md @@ -1,6 +1,6 @@ # Functions for implementing the IN operator -## in, notIn, globalIn, globalNotIn +## in, notIn, globalIn, globalNotIn {#in-functions} See the section [IN operators](../select.md#select-in-operators). diff --git a/docs/en/query_language/functions/string_functions.md b/docs/en/query_language/functions/string_functions.md index b2ef05d4c3e..1eca9c0e815 100644 --- a/docs/en/query_language/functions/string_functions.md +++ b/docs/en/query_language/functions/string_functions.md @@ -145,11 +145,11 @@ Decode base64-encoded string 's' into original string. In case of failure raises ## tryBase64Decode(s) Similar to base64Decode, but in case of error an empty string would be returned. -## endsWith(s, suffix) +## endsWith(s, suffix) {#function-endswith} Returns whether to end with the specified suffix. Returns 1 if the string ends with the specified suffix, otherwise it returns 0. -## startsWith(s, prefix) +## startsWith(s, prefix) {#function-startswith} Returns whether to start with the specified prefix. Returns 1 if the string starts with the specified prefix, otherwise it returns 0. 
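A short sketch of the behaviour behind the two new anchors; the string literals are illustrative:

```sql
-- Both functions return UInt8: 1 when the prefix/suffix matches, 0 otherwise.
SELECT
    startsWith('clickhouse', 'click') AS has_prefix,  -- 1
    endsWith('clickhouse', 'base')    AS has_suffix   -- 0
```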
diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index 86f038b6a1e..723b8edc154 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -33,7 +33,7 @@ Returns the index `i` (starting from 1) of the leftmost found needlei For a case-insensitive search or/and in UTF-8 format use functions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) +## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) {#function-multisearchany} Returns 1, if at least one string needlei matches the string `haystack` and 0 otherwise. @@ -86,7 +86,7 @@ Extracts a fragment of a string using a regular expression. If 'haystack' doesn' Extracts all the fragments of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the 'extract' function (it takes the first subpattern, or the entire expression if there isn't a subpattern). -## like(haystack, pattern), haystack LIKE pattern operator +## like(haystack, pattern), haystack LIKE pattern operator {#function-like} Checks whether a string matches a simple regular expression. The regular expression can contain the metasymbols `%` and `_`. @@ -100,7 +100,7 @@ Use the backslash (`\`) for escaping metasymbols. See the note on escaping in th For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function. For other regular expressions, the code is the same as for the 'match' function. -## notLike(haystack, pattern), haystack NOT LIKE pattern operator +## notLike(haystack, pattern), haystack NOT LIKE pattern operator {#function-notlike} The same thing as 'like', but negative. diff --git a/docs/en/query_language/table_functions/file.md b/docs/en/query_language/table_functions/file.md index ff9ba31cb28..0cb1f0d36bf 100644 --- a/docs/en/query_language/table_functions/file.md +++ b/docs/en/query_language/table_functions/file.md @@ -9,9 +9,9 @@ file(path, format, structure) **Input parameters** -- `path` — The relative path to the file from [user_files_path](../../operations/server_settings/settings.md#server_settings-user_files_path). +- `path` — The relative path to the file from [user_files_path](../../operations/server_settings/settings.md#server_settings-user_files_path). Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, ``'abc', 'def'` — strings. - `format` — The [format](../../interfaces/formats.md#formats) of the file. -- `structure` — Structure of the table. Format `'colunmn1_name column1_ype, column2_name column2_type, ...'`. +- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. **Returned value** @@ -51,4 +51,16 @@ LIMIT 2 SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10 ``` +**Globs in path** + +- `*` — Matches any number of any characters including none. +- `?` — Matches any single character. +- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Matches any number in range from N to M including both borders. + +!!! 
warning + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. + +Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. + [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/file/) diff --git a/docs/en/query_language/table_functions/hdfs.md b/docs/en/query_language/table_functions/hdfs.md new file mode 100644 index 00000000000..cce9b308101 --- /dev/null +++ b/docs/en/query_language/table_functions/hdfs.md @@ -0,0 +1,49 @@ + +# hdfs + +Creates a table from a file in HDFS. + +``` +hdfs(URI, format, structure) +``` + +**Input parameters** + +- `URI` — The relative URI to the file in HDFS. Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, ``'abc', 'def'` — strings. +- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. + +**Returned value** + +A table with the specified structure for reading or writing data in the specified file. + +**Example** + +Table from `hdfs://hdfs1:9000/test` and selection of the first two rows from it: + +```sql +SELECT * +FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') +LIMIT 2 +``` + +``` +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +**Globs in path** + +- `*` — Matches any number of any characters including none. +- `?` — Matches any single character. +- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Matches any number in range from N to M including both borders. + +!!! warning + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. + +Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. + +[Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/hdfs/) diff --git a/docs/fa/interfaces/formats.md b/docs/fa/interfaces/formats.md index a8c91c73b8a..d01dc1d1be0 100644 --- a/docs/fa/interfaces/formats.md +++ b/docs/fa/interfaces/formats.md @@ -12,6 +12,8 @@ Format | INSERT | SELECT [TabSeparatedRaw](formats.md#tabseparatedraw) | ✗ | ✔ | [TabSeparatedWithNames](formats.md#tabseparatedwithnames) | ✔ | ✔ | [TabSeparatedWithNamesAndTypes](formats.md#tabseparatedwithnamesandtypes) | ✔ | ✔ | +[Template](#format-template) | ✔ | ✔ | +[TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | [CSV](formats.md#csv) | ✔ | ✔ | [CSVWithNames](formats.md#csvwithnames) | ✔ | ✔ | [Values](formats.md#data-format-values) | ✔ | ✔ | @@ -115,6 +117,122 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD همچنین این فرمت تحت عنوان ` TSVWithNamesAndTypes`وجود دارد. + +## Template {#format-template} + +This format allows to specify a custom format string with placeholders for values with specified escaping rule. + +It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. 
`output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) + +Format string `format_schema_rows` specifies rows format with the following syntax: + + `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, + + where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), + `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), + `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: + + - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) + - `Escaped` (similarly to `TSV`) + - `Quoted` (similarly to `Values`) + - `Raw` (without escaping, similarly to `TSVRaw`) + - `None` (no escaping rule, see further) + + If escaping rule is omitted, then`None` will be used. `XML` and `Raw` are suitable only for output. + + So, for the following format string: + + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` + + the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example: + + `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` + + The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) + +Format string `format_schema` has the same syntax as `format_schema_rows` and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: + + - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. + - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) + - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) + - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) + - `rows` is the total number of output rows + - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. + - `time` is the request execution time in seconds + - `rows_read` is the number of rows have been read + - `bytes_read` is the number of bytes (uncompressed) have been read + + The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. + If the `format_schema` setting is an empty string, `${data}` is used as default value. + For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). + + `Select` example: +```sql +SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 +FORMAT Template +SETTINGS format_schema = ' + Search phrases + + + + ${data} +
Search phrases
Search phrase Count
+ + ${max} +
Max
+ Processed ${rows_read:XML} rows in ${time:XML} sec + +', +format_schema_rows = ' ${SearchPhrase:XML} ${с:XML} ', +format_schema_rows_between_delimiter = '\n ' +``` +```html + + Search phrases + + + + + + + + +
Search phrases
Search phrase Count
8267016
bathroom interior design 2166
yandex 1655
spring 2014 fashion 1549
freeform photos 1480
+ + +
Max
8873898
+ Processed 3095973 rows in 0.1569913 sec + + +``` + +`Insert` example: +``` +Some header +Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 +Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 +Total rows: 2 +``` +```sql +INSERT INTO UserActivity FORMAT Template SETTINGS +format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', +format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' +``` +`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in suffix will be ignored. +All delimiters in the input data must be strictly equal to delimiters in specified format strings. + +## TemplateIgnoreSpaces {#templateignorespaces} + +This format is suitable only for input. +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. +It's possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): +```sql +INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS +format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', +format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', +format_schema_rows_between_delimiter = ',' +``` + ## TSKV مشابه فرمت TabSeparated، اما خروجی به صورت name=value می باشد. نام ها مشابه روش TabSeparated، escape می شوند، و همچنین = symbol هم escape می شود. diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 8f9d100ad91..15f7552f877 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -10,7 +10,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Template](#template) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | @@ -120,7 +120,7 @@ world Этот формат также доступен под именем `TSVWithNamesAndTypes`. -## Template {#template} +## Template {#format-template} Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. @@ -927,7 +927,7 @@ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parq clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` -Для обмена данными с экосистемой Hadoop можно использовать движки таблиц `HDFS` и `URL`. +Для обмена данными с экосистемой Hadoop можно использовать движки таблиц [`HDFS`](../../operations/table_engines/hdfs.md) и `URL`. 
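A sketch of the Hadoop exchange mentioned above, assuming an HDFS namenode at `hdfs://hdfs1:9000` (the path is illustrative) and a build with Parquet support:

```sql
-- Illustrative only: an HDFS-backed table that stores rows as Parquet,
-- so the same file can be consumed by other Hadoop tools.
CREATE TABLE hdfs_parquet_table (name String, value UInt32)
    ENGINE = HDFS('hdfs://hdfs1:9000/parquet_storage', 'Parquet');

INSERT INTO hdfs_parquet_table VALUES ('one', 1), ('two', 2);
SELECT * FROM hdfs_parquet_table;
```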
## Схема формата {#formatschema} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 831f5958c29..c3518eb7f74 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -72,7 +72,7 @@ ClickHouse применяет настройку в тех случаях, ко ## format_schema -Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) или [Template](../../interfaces/formats.md#template). Значение параметра зависит от формата. +Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) или [Template](../../interfaces/formats.md#format-template). Значение параметра зависит от формата. ## fsync_metadata diff --git a/docs/ru/operations/table_engines/hdfs.md b/docs/ru/operations/table_engines/hdfs.md new file mode 100644 index 00000000000..3f42c9ec447 --- /dev/null +++ b/docs/ru/operations/table_engines/hdfs.md @@ -0,0 +1,48 @@ +# HDFS {#table_engines-hdfs} + +Управляет данными в HDFS. Данный движок похож на движок [File](file.md) и на движок [URL](url.md). + +## Использование движка + +``` +ENGINE = HDFS(URI, format) +``` + +В параметр `URI` нужно передавать полный URI файла в HDFS. +Параметр `format` должен быть таким, который ClickHouse может использовать и в запросах `INSERT`, и в запросах `SELECT`. Полный список поддерживаемых форматов смотрите в разделе [Форматы](../../interfaces/formats.md#formats). + +**Пример:** + +**1.** Создадим на сервере таблицу `hdfs_engine_table`: + +``` sql +CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') +``` + +**2.** Заполним файл: +``` sql +INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) +``` + +**3.** Запросим данные: + +``` sql +SELECT * FROM hdfs_engine_table LIMIT 2 +``` + +``` +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` + +## Детали реализации + +- Поддерживается многопоточное чтение и запись. +- Не поддерживается: + - использование операций `ALTER` и `SELECT...SAMPLE`; + - индексы; + - репликация. + +[Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/hdfs/) diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md index 7945276434f..11d5c819b02 100644 --- a/docs/ru/query_language/functions/array_functions.md +++ b/docs/ru/query_language/functions/array_functions.md @@ -74,7 +74,7 @@ SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res Индексы в массиве начинаются с единицы. Поддерживаются отрицательные индексы. В этом случае, будет выбран соответствующий по номеру элемент с конца. Например, arr\[-1\] - последний элемент массива. -Если индекс выходит за границы массива, то возвращается некоторое значение по умолчанию (0 для чисел, пустая строка для строк и т. п.). +Если индекс выходит за границы массива, то возвращается некоторое значение по умолчанию (0 для чисел, пустая строка для строк и т. п.), кроме случая с неконстантным массивом и константным индексом 0 (в этом случае будет ошибка `Array indices are 1-based`). 
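A sketch of the indexing behaviour described above; the literals are illustrative:

```sql
-- An out-of-range index returns the default value for the element type.
SELECT [1, 2, 3][10] AS out_of_range;  -- 0

-- A constant index 0 on a non-constant array is expected to fail
-- with the 'Array indices are 1-based' error mentioned above.
SELECT materialize([1, 2, 3])[0];
```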
## has(arr, elem) diff --git a/docs/ru/query_language/functions/comparison_functions.md b/docs/ru/query_language/functions/comparison_functions.md index 392c6c2573c..b69e272133b 100644 --- a/docs/ru/query_language/functions/comparison_functions.md +++ b/docs/ru/query_language/functions/comparison_functions.md @@ -18,16 +18,16 @@ Замечание. До версии 1.1.54134 сравнение знаковых и беззнаковых целых чисел производилось также, как в C++. То есть, вы могли получить неверный результат в таких случаях: SELECT 9223372036854775807 > -1. С версии 1.1.54134 поведение изменилось и стало математически корректным. -## equals, оператор a = b и a == b +## equals, оператор a = b и a == b {#function-equals} -## notEquals, оператор a != b и a `<>` b +## notEquals, оператор a != b и a `<>` b {#function-notequals} -## less, оператор `<` +## less, оператор `<` {#function-less} -## greater, оператор `>` +## greater, оператор `>` {#function-greater} -## lessOrEquals, оператор `<=` +## lessOrEquals, оператор `<=` {#function-lessorequals} -## greaterOrEquals, оператор `>=` +## greaterOrEquals, оператор `>=` {#function-greaterorequals} [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/comparison_functions/) diff --git a/docs/ru/query_language/functions/in_functions.md b/docs/ru/query_language/functions/in_functions.md index 7eb87e53a6e..8b4eccd0db6 100644 --- a/docs/ru/query_language/functions/in_functions.md +++ b/docs/ru/query_language/functions/in_functions.md @@ -1,6 +1,7 @@ # Функции для реализации оператора IN. -## in, notIn, globalIn, globalNotIn +## in, notIn, globalIn, globalNotIn {#in-functions} + Смотрите раздел [Операторы IN](../select.md#select-in-operators). ## tuple(x, y, ...), оператор (x, y, ...) diff --git a/docs/ru/query_language/functions/string_functions.md b/docs/ru/query_language/functions/string_functions.md index f427017ee59..cc6563dacd5 100644 --- a/docs/ru/query_language/functions/string_functions.md +++ b/docs/ru/query_language/functions/string_functions.md @@ -117,6 +117,14 @@ SELECT format('{} {}', 'Hello', 'World') ## tryBase64Decode(s) Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. +## endsWith(s, suffix) {#function-endswith} + +Возвращает 1, если строка завершается указанным суффиксом, и 0 в противном случае. + +## startsWith(s, prefix) {#function-startswith} + +Возвращает 1, если строка начинается указанным префиксом, и 0 в противном случае. + ## CRC32(s) Возвращает чексумму CRC32 данной строки. Тип результата - UInt32. diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 3fc500bd203..0f86554b552 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -29,7 +29,7 @@ Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) +## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) {#function-multisearchany} Возвращает 1, если хотя бы одна подстрока needlei нашлась в строке `haystack` и 0 иначе. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. 
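A small sketch of `multiSearchAny`; the strings are illustrative:

```sql
-- Returns 1 because the needle 'House' occurs in the haystack;
-- the search is case-sensitive, so 'click' alone would not match.
SELECT multiSearchAny('Hello ClickHouse', ['click', 'House']) AS any_match  -- 1
```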
@@ -77,7 +77,7 @@
 ## extractAll(haystack, pattern)
 Извлечение всех фрагментов строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Возвращается массив строк, состоящий из всех соответствий регулярному выражению. В остальном, поведение аналогично функции extract (по-прежнему, вынимается первый subpattern, или всё выражение, если subpattern-а нет).

-## like(haystack, pattern), оператор haystack LIKE pattern
+## like(haystack, pattern), оператор haystack LIKE pattern {#function-like}
 Проверка строки на соответствие простому регулярному выражению.
 Регулярное выражение может содержать метасимволы `%` и `_`.
@@ -90,7 +90,7 @@
 Для регулярных выражений вида `%needle%` действует более оптимальный код, который работает так же быстро, как функция `position`.
 Для остальных регулярных выражений, код аналогичен функции match.

-## notLike(haystack, pattern), оператор haystack NOT LIKE pattern
+## notLike(haystack, pattern), оператор haystack NOT LIKE pattern {#function-notlike}
 То же, что like, но с отрицанием.

 ## ngramDistance(haystack, needle)
diff --git a/docs/ru/query_language/table_functions/file.md b/docs/ru/query_language/table_functions/file.md
index 0fb16bf5a48..9fc82b151b8 100644
--- a/docs/ru/query_language/table_functions/file.md
+++ b/docs/ru/query_language/table_functions/file.md
@@ -9,7 +9,7 @@ file(path, format, structure)

 **Входные параметры**

-- `path` — относительный путь до файла от [user_files_path](../../operations/server_settings/settings.md#server_settings-user_files_path).
+- `path` — относительный путь до файла от [user_files_path](../../operations/server_settings/settings.md#server_settings-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения: `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, `'abc', 'def'` — строки.
 - `format` — [формат](../../interfaces/formats.md#formats) файла.
 - `structure` — структура таблицы. Формат `'column1_name column1_type, column2_name column2_type, ...'`.
@@ -45,4 +45,16 @@ LIMIT 2
 └─────────┴─────────┴─────────┘
 ```

+**Шаблоны в пути файла**
+
+- `*` — Матчит любое количество любых символов, включая отсутствие символов.
+- `?` — Матчит ровно один любой символ.
+- `{some_string,another_string,yet_another_one}` — Матчит любую из строк `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Матчит любое число в интервале от `N` до `M` включительно.
+
+!!! warning
+    Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`.
+
+Шаблоны могут содержаться в разных частях пути. Обрабатываться будут ровно те файлы, которые и удовлетворяют всему шаблону пути, и существуют в файловой системе.
+
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/file/)
diff --git a/docs/ru/query_language/table_functions/hdfs.md b/docs/ru/query_language/table_functions/hdfs.md
new file mode 100644
index 00000000000..ae881edea35
--- /dev/null
+++ b/docs/ru/query_language/table_functions/hdfs.md
@@ -0,0 +1,48 @@
+
+# hdfs
+
+Создаёт таблицу из файла в HDFS.
+
+```
+hdfs(URI, format, structure)
+```
+
+**Входные параметры**
+
+- `URI` — URI файла в HDFS. Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения: `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, `'abc', 'def'` — строки.
+- `format` — [формат](../../interfaces/formats.md#formats) файла.
+- `structure` — структура таблицы. Формат `'column1_name column1_type, column2_name column2_type, ...'`.
+
+**Возвращаемое значение**
+
+Таблица с указанной структурой, предназначенная для чтения или записи данных в указанном файле.
+
+**Пример**
+
+Таблица из `hdfs://hdfs1:9000/test` и выборка первых двух строк из неё:
+
+``` sql
+SELECT *
+FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')
+LIMIT 2
+```
+```
+┌─column1─┬─column2─┬─column3─┐
+│       1 │       2 │       3 │
+│       3 │       2 │       1 │
+└─────────┴─────────┴─────────┘
+```
+
+**Шаблоны в пути файла**
+
+- `*` — Матчит любое количество любых символов, включая отсутствие символов.
+- `?` — Матчит ровно один любой символ.
+- `{some_string,another_string,yet_another_one}` — Матчит любую из строк `'some_string', 'another_string', 'yet_another_one'`.
+- `{N..M}` — Матчит любое число в интервале от `N` до `M` включительно.
+
+!!! warning
+    Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`.
+
+Шаблоны могут содержаться в разных частях пути. Обрабатываться будут ровно те файлы, которые и удовлетворяют всему шаблону пути, и существуют в файловой системе.
+
+[Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/hdfs/)
diff --git a/docs/zh/interfaces/formats.md b/docs/zh/interfaces/formats.md
index 65358115295..e4663c2d418 100644
--- a/docs/zh/interfaces/formats.md
+++ b/docs/zh/interfaces/formats.md
@@ -10,6 +10,8 @@ ClickHouse 可以接受多种数据格式,可以在 (`INSERT`) 以及 (`SELECT
 | [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ |
 | [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ |
 | [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ |
+| [Template](#format-template) | ✔ | ✔ |
+| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ |
 | [CSV](#csv) | ✔ | ✔ |
 | [CSVWithNames](#csvwithnames) | ✔ | ✔ |
 | [Values](#data-format-values) | ✔ | ✔ |
@@ -115,6 +117,121 @@ world
 这种格式也可以使用名称 ` TSVWithNamesAndTypes` 来表示。

+## Template {#format-template}
+
+This format allows specifying a custom format string, with placeholders for values and an escaping rule for each of them.
+
+It uses the settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see below).
+
+The format string `format_schema_rows` specifies the row format with the following syntax:
+
+ `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`,
+
+ where `delimiter_i` is a delimiter between values (the `$` symbol can be escaped as `$$`),
+ `column_i` is the name of a column whose values are to be selected or inserted (if empty, the column is skipped),
+ `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported:
+
+ - `CSV`, `JSON`, `XML` (similarly to the formats of the same names)
+ - `Escaped` (similarly to `TSV`)
+ - `Quoted` (similarly to `Values`)
+ - `Raw` (without escaping, similarly to `TSVRaw`)
+ - `None` (no escaping rule, see below)
+
+ If the escaping rule is omitted, `None` is used. `XML` and `Raw` are suitable only for output.
+
+ So, for the following format string:
+
+ `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};`
+
+ the values of the `SearchPhrase`, `c` and `price` columns, escaped as `Quoted`, `Escaped` and `JSON` respectively, will be printed (for select) or expected (for insert) between the `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters. For example:
+
+ `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;`
+
+ The `format_schema_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default).
+
+The format string `format_schema` has the same syntax as `format_schema_rows` and allows specifying a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names:
+
+ - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string.
+ - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS)
+ - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1)
+ - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1)
+ - `rows` is the total number of output rows
+ - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, `rows_before_limit_at_least` is the exact number of rows there would have been without a LIMIT.
+ - `time` is the request execution time in seconds
+ - `rows_read` is the number of rows that have been read
+ - `bytes_read` is the number of bytes (uncompressed) that have been read
+
+ The placeholders `data`, `totals`, `min` and `max` must not have an escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified.
+ If the `format_schema` setting is an empty string, `${data}` is used as the default value.
+ For insert queries, the format allows skipping some columns or fields if a prefix or a suffix is specified (see the example below).
+
+ `Select` example:
+```sql
+SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5
+FORMAT Template
+SETTINGS format_schema = '<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+    ${data}
+  </table>
+  <table border="1"> <caption>Max</caption>
+    ${max}
+  </table>
+  <b>Processed ${rows_read:XML} rows in ${time:XML} sec</b>
+ </body>
+</html>',
+format_schema_rows = '<tr> <td>${SearchPhrase:XML}</td> <td>${c:XML}</td> </tr>',
+format_schema_rows_between_delimiter = '\n    '
+```
+```html
+<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+    <tr> <td></td> <td>8267016</td> </tr>
+    <tr> <td>bathroom interior design</td> <td>2166</td> </tr>
+    <tr> <td>yandex</td> <td>1655</td> </tr>
+    <tr> <td>spring 2014 fashion</td> <td>1549</td> </tr>
+    <tr> <td>freeform photos</td> <td>1480</td> </tr>
+  </table>
+  <table border="1"> <caption>Max</caption>
+    <tr> <td></td> <td>8873898</td> </tr>
+  </table>
+  <b>Processed 3095973 rows in 0.1569913 sec</b>
+ </body>
+</html>
+```
+
+`Insert` example:
+```
+Some header
+Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1
+Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1
+Total rows: 2
+```
+```sql
+INSERT INTO UserActivity FORMAT Template SETTINGS
+format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n',
+format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}'
+```
+`PageViews`, `UserID`, `Duration` and `Sign` inside the placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in the suffix will be ignored.
+All delimiters in the input data must be strictly equal to the delimiters in the specified format strings.
+
+## TemplateIgnoreSpaces {#templateignorespaces}
+
+This format is suitable only for input.
+It is similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if the format strings contain whitespace characters, these characters will be expected in the input stream. It also allows specifying empty placeholders (`${}` or `${:None}`) to split a delimiter into separate parts in order to ignore spaces between them. Such placeholders are used only for skipping whitespace characters.
+It is possible to read `JSON` using this format if the values of the columns have the same order in all rows. For example, the following request can be used for inserting data from the output example of the [JSON](#json) format:
+```sql
+INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS
+format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}',
+format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}',
+format_schema_rows_between_delimiter = ','
+```
+
 ## TSKV {#tskv}

 与 `TabSeparated` 格式类似,但它输出的是 `name=value` 的格式。名称会和 `TabSeparated` 格式一样被转义,`=` 字符也会被转义。
diff --git a/utils/github/__main__.py b/utils/github/__main__.py
index 836ad734fce..d5de241f25f 100644
--- a/utils/github/__main__.py
+++ b/utils/github/__main__.py
@@ -47,14 +47,14 @@ parser.add_argument('--login', type=str,

 args = parser.parse_args()

-github = query.Query(args.token, 50)
+github = query.Query(args.token, 30)
 repo = local.Local(args.repo, args.remote, github.get_default_branch())

 stables = repo.get_stables()[-args.number:] # [(branch name, base)]
 if not stables:
-    sys.exit('No stable branches found!')
+    sys.exit('No release branches found!')
 else:
-    print('Found stable branches:')
+    print('Found release branches:')
     for stable in stables:
         print(f'{CHECK_MARK} {stable[0]} forked from {stable[1]}')
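Returning to the `Template` format documented above: the HTML example is deliberately elaborate, so here is a compact editorial sketch with a plain-text schema. It is a hypothetical query, not part of the patch, and it assumes a server where the `format_schema`, `format_schema_rows` and `format_schema_rows_between_delimiter` settings behave as described in that hunk:

```sql
-- editorial illustration of the Template settings documented above (plain-text schema)
SELECT number AS n, number * number AS square
FROM system.numbers
LIMIT 3
FORMAT Template
SETTINGS
    format_schema = 'results:\n${data}\n${rows:Escaped} rows in ${time:Escaped} sec\n',
    format_schema_rows = 'n = ${n:CSV}, square = ${square:CSV}',
    format_schema_rows_between_delimiter = '\n'
```

If the settings work as described, this prints three `n = ..., square = ...` lines between the `results:` prefix and a row-count/time suffix.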