mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-30 03:22:14 +00:00
Merge branch 'master' into issue-5286
This commit is contained in:
commit
e90bf74ce8
43
CHANGELOG.md
43
CHANGELOG.md
@ -1,3 +1,42 @@
|
|||||||
|
## ClickHouse release 19.11.5.28, 2019-08-05
|
||||||
|
|
||||||
|
### Bug fix
|
||||||
|
* Fixed the possibility of hanging queries when server is overloaded. [#6301](https://github.com/yandex/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Fix FPE in yandexConsistentHash function. This fixes [#6304](https://github.com/yandex/ClickHouse/issues/6304). [#6126](https://github.com/yandex/ClickHouse/pull/6126) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/yandex/ClickHouse/issues/6257). [#6281](https://github.com/yandex/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
|
||||||
|
* Fix parsing of `bool` settings from `true` and `false` strings in configuration files. [#6278](https://github.com/yandex/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin))
|
||||||
|
* Fix rare bug with incompatible stream headers in queries to `Distributed` table over `MergeTree` table when part of `WHERE` moves to `PREWHERE`. [#6236](https://github.com/yandex/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin))
|
||||||
|
* Fixed overflow in integer division of signed type to unsigned type. This fixes [#6214](https://github.com/yandex/ClickHouse/issues/6214). [#6233](https://github.com/yandex/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
|
||||||
|
### Backward Incompatible Change
|
||||||
|
* `Kafka` still broken.
|
||||||
|
|
||||||
|
## ClickHouse release 19.11.4.24, 2019-08-01
|
||||||
|
|
||||||
|
### Bug Fix
|
||||||
|
* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/yandex/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin))
|
||||||
|
* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/yandex/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ))
|
||||||
|
* Fixed hang in `JSONExtractRaw` function. Fixed [#6195](https://github.com/yandex/ClickHouse/issues/6195) [#6198](https://github.com/yandex/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Fix segfault in ExternalLoader::reloadOutdated(). [#6082](https://github.com/yandex/ClickHouse/pull/6082) ([Vitaly Baranov](https://github.com/vitlibar))
|
||||||
|
* Fixed the case when server may close listening sockets but not shutdown and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/yandex/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Fixed useless and incorrect condition on update field for initial loading of external dictionaries via ODBC, MySQL, ClickHouse and HTTP. This fixes [#6069](https://github.com/yandex/ClickHouse/issues/6069) [#6083](https://github.com/yandex/ClickHouse/pull/6083) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case if it doesn't contain Nulls (e.g. in query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`. [#6094](https://github.com/yandex/ClickHouse/issues/6094) [#6119](https://github.com/yandex/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
|
||||||
|
* Fix non-deterministic result of "uniq" aggregate function in extreme rare cases. The bug was present in all ClickHouse versions. [#6058](https://github.com/yandex/ClickHouse/pull/6058) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Segfault when we set a little bit too high CIDR on the function `IPv6CIDRToRange`. [#6068](https://github.com/yandex/ClickHouse/pull/6068) ([Guillaume Tassery](https://github.com/YiuRULE))
|
||||||
|
* Fixed small memory leak when server throw many exceptions from many different contexts. [#6144](https://github.com/yandex/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
* Fix the situation when consumer got paused before subscription and not resumed afterwards. [#6075](https://github.com/yandex/ClickHouse/pull/6075) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version.
|
||||||
|
* Clearing the Kafka data buffer from the previous read operation that was completed with an error [#6026](https://github.com/yandex/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) Note that Kafka is broken in this version.
|
||||||
|
* Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/yandex/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7))
|
||||||
|
|
||||||
|
### Build/Testing/Packaging Improvement
|
||||||
|
* Added official `rpm` packages. [#5740](https://github.com/yandex/ClickHouse/pull/5740) ([proller](https://github.com/proller)) ([alesapin](https://github.com/alesapin))
|
||||||
|
* Add an ability to build `.rpm` and `.tgz` packages with `packager` script. [#5769](https://github.com/yandex/ClickHouse/pull/5769) ([alesapin](https://github.com/alesapin))
|
||||||
|
* Fixes for "Arcadia" build system. [#6223](https://github.com/yandex/ClickHouse/pull/6223) ([proller](https://github.com/proller))
|
||||||
|
|
||||||
|
### Backward Incompatible Change
|
||||||
|
* `Kafka` is broken in this version.
|
||||||
|
|
||||||
|
|
||||||
## ClickHouse release 19.11.3.11, 2019-07-18
|
## ClickHouse release 19.11.3.11, 2019-07-18
|
||||||
|
|
||||||
### New Feature
|
### New Feature
|
||||||
@ -35,6 +74,7 @@
|
|||||||
* clickhouse-copier: Fix use-after free on shutdown [#5752](https://github.com/yandex/ClickHouse/pull/5752) ([proller](https://github.com/proller))
|
* clickhouse-copier: Fix use-after free on shutdown [#5752](https://github.com/yandex/ClickHouse/pull/5752) ([proller](https://github.com/proller))
|
||||||
* Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes successfully parse. [#5938](https://github.com/yandex/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
* Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes successfully parse. [#5938](https://github.com/yandex/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
* Fix shutdown of SystemLogs [#5802](https://github.com/yandex/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ))
|
* Fix shutdown of SystemLogs [#5802](https://github.com/yandex/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ))
|
||||||
|
* Fix hanging when condition in invalidate_query depends on a dictionary. [#6011](https://github.com/yandex/ClickHouse/pull/6011) ([Vitaly Baranov](https://github.com/vitlibar))
|
||||||
|
|
||||||
### Improvement
|
### Improvement
|
||||||
* Allow unresolvable addresses in cluster configuration. They will be considered unavailable and tried to resolve at every connection attempt. This is especially useful for Kubernetes. This fixes [#5714](https://github.com/yandex/ClickHouse/issues/5714) [#5924](https://github.com/yandex/ClickHouse/pull/5924) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
* Allow unresolvable addresses in cluster configuration. They will be considered unavailable and tried to resolve at every connection attempt. This is especially useful for Kubernetes. This fixes [#5714](https://github.com/yandex/ClickHouse/issues/5714) [#5924](https://github.com/yandex/ClickHouse/pull/5924) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
@ -55,13 +95,12 @@
|
|||||||
* Inverting ngramSearch to be more intuitive [#5807](https://github.com/yandex/ClickHouse/pull/5807) ([Danila Kutenin](https://github.com/danlark1))
|
* Inverting ngramSearch to be more intuitive [#5807](https://github.com/yandex/ClickHouse/pull/5807) ([Danila Kutenin](https://github.com/danlark1))
|
||||||
* Add user parsing in HDFS engine builder [#5946](https://github.com/yandex/ClickHouse/pull/5946) ([akonyaev90](https://github.com/akonyaev90))
|
* Add user parsing in HDFS engine builder [#5946](https://github.com/yandex/ClickHouse/pull/5946) ([akonyaev90](https://github.com/akonyaev90))
|
||||||
* Update default value of `max_ast_elements parameter` [#5933](https://github.com/yandex/ClickHouse/pull/5933) ([Artem Konovalov](https://github.com/izebit))
|
* Update default value of `max_ast_elements parameter` [#5933](https://github.com/yandex/ClickHouse/pull/5933) ([Artem Konovalov](https://github.com/izebit))
|
||||||
|
* Added a notion of obsolete settings. The obsolete setting `allow_experimental_low_cardinality_type` can be used with no effect. [0f15c01c6802f7ce1a1494c12c846be8c98944cd](https://github.com/yandex/ClickHouse/commit/0f15c01c6802f7ce1a1494c12c846be8c98944cd) [Alexey Milovidov](https://github.com/alexey-milovidov)
|
||||||
|
|
||||||
### Performance Improvement
|
### Performance Improvement
|
||||||
* Increase number of streams to SELECT from Merge table for more uniform distribution of threads. Added setting `max_streams_multiplier_for_merge_tables`. This fixes [#5797](https://github.com/yandex/ClickHouse/issues/5797) [#5915](https://github.com/yandex/ClickHouse/pull/5915) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
* Increase number of streams to SELECT from Merge table for more uniform distribution of threads. Added setting `max_streams_multiplier_for_merge_tables`. This fixes [#5797](https://github.com/yandex/ClickHouse/issues/5797) [#5915](https://github.com/yandex/ClickHouse/pull/5915) ([alexey-milovidov](https://github.com/alexey-milovidov))
|
||||||
|
|
||||||
### Build/Testing/Packaging Improvement
|
### Build/Testing/Packaging Improvement
|
||||||
* Added official `rpm` packages. [#5740](https://github.com/yandex/ClickHouse/pull/5740) ([proller](https://github.com/proller)) ([alesapin](https://github.com/alesapin))
|
|
||||||
* Add an ability to build `.rpm` and `.tgz` packages with `packager` script. [#5769](https://github.com/yandex/ClickHouse/pull/5769) ([alesapin](https://github.com/alesapin))
|
|
||||||
* Add a backward compatibility test for client-server interaction with different versions of clickhouse. [#5868](https://github.com/yandex/ClickHouse/pull/5868) ([alesapin](https://github.com/alesapin))
|
* Add a backward compatibility test for client-server interaction with different versions of clickhouse. [#5868](https://github.com/yandex/ClickHouse/pull/5868) ([alesapin](https://github.com/alesapin))
|
||||||
* Test coverage information in every commit and pull request. [#5896](https://github.com/yandex/ClickHouse/pull/5896) ([alesapin](https://github.com/alesapin))
|
* Test coverage information in every commit and pull request. [#5896](https://github.com/yandex/ClickHouse/pull/5896) ([alesapin](https://github.com/alesapin))
|
||||||
* Cooperate with address sanitizer to support our custom allocators (`Arena` and `ArenaWithFreeLists`) for better debugging of "use-after-free" errors. [#5728](https://github.com/yandex/ClickHouse/pull/5728) ([akuzm](https://github.com/akuzm))
|
* Cooperate with address sanitizer to support our custom allocators (`Arena` and `ArenaWithFreeLists`) for better debugging of "use-after-free" errors. [#5728](https://github.com/yandex/ClickHouse/pull/5728) ([akuzm](https://github.com/akuzm))
|
||||||
|
2074
CHANGELOG_RU.md
2074
CHANGELOG_RU.md
File diff suppressed because it is too large
Load Diff
2
contrib/fastops
vendored
2
contrib/fastops
vendored
@ -1 +1 @@
|
|||||||
Subproject commit d2c85c5d6549cfd648a7f31ef7b14341881ff8ae
|
Subproject commit 88752a5e03cf34639a4a37a4b41d8b463fffd2b5
|
@ -3,9 +3,8 @@ set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/fastops)
|
|||||||
set(SRCS "")
|
set(SRCS "")
|
||||||
|
|
||||||
if(HAVE_AVX)
|
if(HAVE_AVX)
|
||||||
set (SRCS ${SRCS} ${LIBRARY_DIR}/fastops/avx/ops_avx.cpp ${LIBRARY_DIR}/fastops/core/FastIntrinsics.cpp)
|
set (SRCS ${SRCS} ${LIBRARY_DIR}/fastops/avx/ops_avx.cpp)
|
||||||
set_source_files_properties(${LIBRARY_DIR}/fastops/avx/ops_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -DNO_AVX2")
|
set_source_files_properties(${LIBRARY_DIR}/fastops/avx/ops_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -DNO_AVX2")
|
||||||
set_source_files_properties(${LIBRARY_DIR}/fastops/core/FastIntrinsics.cpp PROPERTIES COMPILE_FLAGS "-mavx -DNO_AVX2")
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(HAVE_AVX2)
|
if(HAVE_AVX2)
|
||||||
|
@ -5,7 +5,6 @@
|
|||||||
#include <DataStreams/copyData.h>
|
#include <DataStreams/copyData.h>
|
||||||
#include <DataTypes/DataTypeFactory.h>
|
#include <DataTypes/DataTypeFactory.h>
|
||||||
#include "ODBCBlockInputStream.h"
|
#include "ODBCBlockInputStream.h"
|
||||||
#include <Formats/BinaryRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
#include <IO/WriteBufferFromHTTPServerResponse.h>
|
#include <IO/WriteBufferFromHTTPServerResponse.h>
|
||||||
#include <IO/WriteHelpers.h>
|
#include <IO/WriteHelpers.h>
|
||||||
|
@ -299,7 +299,7 @@ void MySQLHandler::authenticate(const HandshakeResponse & handshake_response, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
password.resize(plaintext_size);
|
password.resize(plaintext_size);
|
||||||
for (int i = 0; i < plaintext_size; i++)
|
for (int i = 0; i < plaintext_size; ++i)
|
||||||
{
|
{
|
||||||
password[i] = plaintext[i] ^ static_cast<unsigned char>(scramble[i % scramble.size()]);
|
password[i] = plaintext[i] ^ static_cast<unsigned char>(scramble[i % scramble.size()]);
|
||||||
}
|
}
|
||||||
|
@ -696,6 +696,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
|||||||
|
|
||||||
LOG_INFO(log, "Listening https://" + address.toString());
|
LOG_INFO(log, "Listening https://" + address.toString());
|
||||||
#else
|
#else
|
||||||
|
UNUSED(port);
|
||||||
throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.",
|
throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.",
|
||||||
ErrorCodes::SUPPORT_IS_DISABLED};
|
ErrorCodes::SUPPORT_IS_DISABLED};
|
||||||
#endif
|
#endif
|
||||||
@ -732,6 +733,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
|||||||
new Poco::Net::TCPServerParams));
|
new Poco::Net::TCPServerParams));
|
||||||
LOG_INFO(log, "Listening for connections with secure native protocol (tcp_secure): " + address.toString());
|
LOG_INFO(log, "Listening for connections with secure native protocol (tcp_secure): " + address.toString());
|
||||||
#else
|
#else
|
||||||
|
UNUSED(port);
|
||||||
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
|
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
|
||||||
ErrorCodes::SUPPORT_IS_DISABLED};
|
ErrorCodes::SUPPORT_IS_DISABLED};
|
||||||
#endif
|
#endif
|
||||||
@ -768,6 +770,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
|||||||
|
|
||||||
LOG_INFO(log, "Listening for secure replica communication (interserver) https://" + address.toString());
|
LOG_INFO(log, "Listening for secure replica communication (interserver) https://" + address.toString());
|
||||||
#else
|
#else
|
||||||
|
UNUSED(port);
|
||||||
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
|
throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.",
|
||||||
ErrorCodes::SUPPORT_IS_DISABLED};
|
ErrorCodes::SUPPORT_IS_DISABLED};
|
||||||
#endif
|
#endif
|
||||||
@ -788,6 +791,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
|||||||
|
|
||||||
LOG_INFO(log, "Listening for MySQL compatibility protocol: " + address.toString());
|
LOG_INFO(log, "Listening for MySQL compatibility protocol: " + address.toString());
|
||||||
#else
|
#else
|
||||||
|
UNUSED(port);
|
||||||
throw Exception{"SSL support for MySQL protocol is disabled because Poco library was built without NetSSL support.",
|
throw Exception{"SSL support for MySQL protocol is disabled because Poco library was built without NetSSL support.",
|
||||||
ErrorCodes::SUPPORT_IS_DISABLED};
|
ErrorCodes::SUPPORT_IS_DISABLED};
|
||||||
#endif
|
#endif
|
||||||
|
@ -707,7 +707,7 @@ void Dwarf::LineNumberVM::init()
|
|||||||
lineRange_ = read<uint8_t>(header);
|
lineRange_ = read<uint8_t>(header);
|
||||||
opcodeBase_ = read<uint8_t>(header);
|
opcodeBase_ = read<uint8_t>(header);
|
||||||
SAFE_CHECK(opcodeBase_ != 0, "invalid opcode base");
|
SAFE_CHECK(opcodeBase_ != 0, "invalid opcode base");
|
||||||
standardOpcodeLengths_ = reinterpret_cast<const uint8_t *>(header.data());
|
standardOpcodeLengths_ = reinterpret_cast<const uint8_t *>(header.data()); //-V506
|
||||||
header.remove_prefix(opcodeBase_ - 1);
|
header.remove_prefix(opcodeBase_ - 1);
|
||||||
|
|
||||||
// We don't want to use heap, so we don't keep an unbounded amount of state.
|
// We don't want to use heap, so we don't keep an unbounded amount of state.
|
||||||
|
@ -127,7 +127,7 @@ namespace ErrorCodes
|
|||||||
extern const int INCORRECT_DATA = 117;
|
extern const int INCORRECT_DATA = 117;
|
||||||
extern const int ENGINE_REQUIRED = 119;
|
extern const int ENGINE_REQUIRED = 119;
|
||||||
extern const int CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE = 120;
|
extern const int CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE = 120;
|
||||||
extern const int UNKNOWN_SET_DATA_VARIANT = 121;
|
extern const int UNSUPPORTED_JOIN_KEYS = 121;
|
||||||
extern const int INCOMPATIBLE_COLUMNS = 122;
|
extern const int INCOMPATIBLE_COLUMNS = 122;
|
||||||
extern const int UNKNOWN_TYPE_OF_AST_NODE = 123;
|
extern const int UNKNOWN_TYPE_OF_AST_NODE = 123;
|
||||||
extern const int INCORRECT_ELEMENT_OF_SET = 124;
|
extern const int INCORRECT_ELEMENT_OF_SET = 124;
|
||||||
|
@ -475,6 +475,9 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons
|
|||||||
return on_error("Block structure mismatch in " + context_description + " stream: different types:\n"
|
return on_error("Block structure mismatch in " + context_description + " stream: different types:\n"
|
||||||
+ lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE);
|
+ lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE);
|
||||||
|
|
||||||
|
if (!actual.column || !expected.column)
|
||||||
|
continue;
|
||||||
|
|
||||||
if (actual.column->getName() != expected.column->getName())
|
if (actual.column->getName() != expected.column->getName())
|
||||||
return on_error("Block structure mismatch in " + context_description + " stream: different columns:\n"
|
return on_error("Block structure mismatch in " + context_description + " stream: different columns:\n"
|
||||||
+ lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE);
|
+ lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::BLOCKS_HAVE_DIFFERENT_STRUCTURE);
|
||||||
|
@ -32,7 +32,7 @@ public:
|
|||||||
|
|
||||||
String getName() const override { return "NullAndDoCopy"; }
|
String getName() const override { return "NullAndDoCopy"; }
|
||||||
|
|
||||||
Block getHeader() const override { return {}; }
|
Block getHeader() const override { return input->getHeader(); }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Block readImpl() override
|
Block readImpl() override
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include <DataStreams/AddingDefaultBlockOutputStream.h>
|
#include <DataStreams/AddingDefaultBlockOutputStream.h>
|
||||||
|
#include <DataStreams/ConvertingBlockInputStream.h>
|
||||||
#include <DataStreams/PushingToViewsBlockOutputStream.h>
|
#include <DataStreams/PushingToViewsBlockOutputStream.h>
|
||||||
#include <DataStreams/SquashingBlockInputStream.h>
|
#include <DataStreams/SquashingBlockInputStream.h>
|
||||||
#include <DataTypes/NestedUtils.h>
|
#include <DataTypes/NestedUtils.h>
|
||||||
@ -192,6 +193,7 @@ void PushingToViewsBlockOutputStream::process(const Block & block, size_t view_n
|
|||||||
/// and two-level aggregation is triggered).
|
/// and two-level aggregation is triggered).
|
||||||
in = std::make_shared<SquashingBlockInputStream>(
|
in = std::make_shared<SquashingBlockInputStream>(
|
||||||
in, context.getSettingsRef().min_insert_block_size_rows, context.getSettingsRef().min_insert_block_size_bytes);
|
in, context.getSettingsRef().min_insert_block_size_rows, context.getSettingsRef().min_insert_block_size_bytes);
|
||||||
|
in = std::make_shared<ConvertingBlockInputStream>(context, in, view.out->getHeader(), ConvertingBlockInputStream::MatchColumnsMode::Position);
|
||||||
|
|
||||||
in->readPrefix();
|
in->readPrefix();
|
||||||
|
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
SquashingBlockOutputStream::SquashingBlockOutputStream(BlockOutputStreamPtr & dst, const Block & header, size_t min_block_size_rows, size_t min_block_size_bytes)
|
SquashingBlockOutputStream::SquashingBlockOutputStream(BlockOutputStreamPtr dst, Block header, size_t min_block_size_rows, size_t min_block_size_bytes)
|
||||||
: output(dst), header(header), transform(min_block_size_rows, min_block_size_bytes)
|
: output(std::move(dst)), header(std::move(header)), transform(min_block_size_rows, min_block_size_bytes)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ namespace DB
|
|||||||
class SquashingBlockOutputStream : public IBlockOutputStream
|
class SquashingBlockOutputStream : public IBlockOutputStream
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
SquashingBlockOutputStream(BlockOutputStreamPtr & dst, const Block & header, size_t min_block_size_rows, size_t min_block_size_bytes);
|
SquashingBlockOutputStream(BlockOutputStreamPtr dst, Block header, size_t min_block_size_rows, size_t min_block_size_bytes);
|
||||||
|
|
||||||
Block getHeader() const override { return header; }
|
Block getHeader() const override { return header; }
|
||||||
void write(const Block & block) override;
|
void write(const Block & block) override;
|
||||||
|
@ -1,91 +0,0 @@
|
|||||||
#include <IO/ReadBuffer.h>
|
|
||||||
#include <IO/ReadHelpers.h>
|
|
||||||
#include <Formats/BinaryRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
BinaryRowInputStream::BinaryRowInputStream(ReadBuffer & istr_, const Block & header_, bool with_names_, bool with_types_)
|
|
||||||
: istr(istr_), header(header_), with_names(with_names_), with_types(with_types_)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool BinaryRowInputStream::read(MutableColumns & columns, RowReadExtension &)
|
|
||||||
{
|
|
||||||
if (istr.eof())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
size_t num_columns = columns.size();
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
header.getByPosition(i).type->deserializeBinary(*columns[i], istr);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void BinaryRowInputStream::readPrefix()
|
|
||||||
{
|
|
||||||
/// NOTE The header is completely ignored. This can be easily improved.
|
|
||||||
|
|
||||||
UInt64 columns = 0;
|
|
||||||
String tmp;
|
|
||||||
|
|
||||||
if (with_names || with_types)
|
|
||||||
{
|
|
||||||
readVarUInt(columns, istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (with_names)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
readStringBinary(tmp, istr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (with_types)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
readStringBinary(tmp, istr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatRowBinary(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerInputFormat("RowBinary", [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<BinaryRowInputStream>(buf, sample, false, false),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
|
|
||||||
factory.registerInputFormat("RowBinaryWithNamesAndTypes", [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<BinaryRowInputStream>(buf, sample, true, true),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,30 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
#include <Core/Block.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A stream for inputting data in a binary line-by-line format.
|
|
||||||
*/
|
|
||||||
class BinaryRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
BinaryRowInputStream(ReadBuffer & istr_, const Block & sample_, bool with_names_, bool with_types_);
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension &) override;
|
|
||||||
void readPrefix() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
ReadBuffer & istr;
|
|
||||||
Block header;
|
|
||||||
bool with_names;
|
|
||||||
bool with_types;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,77 +0,0 @@
|
|||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <Columns/IColumn.h>
|
|
||||||
#include <DataTypes/IDataType.h>
|
|
||||||
#include <Formats/BinaryRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
BinaryRowOutputStream::BinaryRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_)
|
|
||||||
: ostr(ostr_), with_names(with_names_), with_types(with_types_), sample(sample_)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void BinaryRowOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
size_t columns = sample.columns();
|
|
||||||
|
|
||||||
if (with_names || with_types)
|
|
||||||
{
|
|
||||||
writeVarUInt(columns, ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (with_names)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
writeStringBinary(sample.safeGetByPosition(i).name, ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (with_types)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
writeStringBinary(sample.safeGetByPosition(i).type->getName(), ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void BinaryRowOutputStream::flush()
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void BinaryRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
type.serializeBinary(column, row_num, ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void registerOutputFormatRowBinary(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("RowBinary", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings &)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<BinaryRowOutputStream>(buf, sample, false, false), sample);
|
|
||||||
});
|
|
||||||
|
|
||||||
factory.registerOutputFormat("RowBinaryWithNamesAndTypes", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings &)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<BinaryRowOutputStream>(buf, sample, true, true), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,37 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
#include <Core/Block.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class IColumn;
|
|
||||||
class IDataType;
|
|
||||||
class WriteBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A stream for outputting data in a binary line-by-line format.
|
|
||||||
*/
|
|
||||||
class BinaryRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
BinaryRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writePrefix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
String getContentType() const override { return "application/octet-stream"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
bool with_names;
|
|
||||||
bool with_types;
|
|
||||||
const Block sample;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,134 +0,0 @@
|
|||||||
#include <Formats/CSVRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
CSVRowOutputStream::CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, const FormatSettings & format_settings)
|
|
||||||
: ostr(ostr_), sample(sample_), with_names(with_names_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
size_t columns = sample.columns();
|
|
||||||
data_types.resize(columns);
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
data_types[i] = sample.safeGetByPosition(i).type;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::flush()
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
size_t columns = sample.columns();
|
|
||||||
|
|
||||||
if (with_names)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
writeCSVString(sample.safeGetByPosition(i).name, ostr);
|
|
||||||
writeChar(i == columns - 1 ? '\n' : format_settings.csv.delimiter, ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
type.serializeAsTextCSV(column, row_num, ostr, format_settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writeFieldDelimiter()
|
|
||||||
{
|
|
||||||
writeChar(format_settings.csv.delimiter, ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
size_t columns = totals.columns();
|
|
||||||
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
writeRowStartDelimiter();
|
|
||||||
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeFieldDelimiter();
|
|
||||||
writeField(*totals.getByPosition(j).column.get(), *totals.getByPosition(j).type.get(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeRowEndDelimiter();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CSVRowOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
size_t rows = extremes.rows();
|
|
||||||
size_t columns = extremes.columns();
|
|
||||||
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeRowBetweenDelimiter();
|
|
||||||
|
|
||||||
writeRowStartDelimiter();
|
|
||||||
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeFieldDelimiter();
|
|
||||||
writeField(*extremes.getByPosition(j).column.get(), *extremes.getByPosition(j).type.get(), i);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeRowEndDelimiter();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatCSV(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
for (bool with_names : {false, true})
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(with_names ? "CSVWithNames" : "CSV", [=](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<CSVRowOutputStream>(buf, sample, with_names, format_settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,56 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class WriteBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** The stream for outputting data in csv format.
|
|
||||||
* Does not conform with https://tools.ietf.org/html/rfc4180 because it uses LF, not CR LF.
|
|
||||||
*/
|
|
||||||
class CSVRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
/** with_names - output in the first line a header with column names
|
|
||||||
* with_types - output in the next line header with the names of the types
|
|
||||||
*/
|
|
||||||
CSVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeFieldDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
void writePrefix() override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
void setExtremes(const Block & extremes_) override { extremes = extremes_; }
|
|
||||||
|
|
||||||
/// https://www.iana.org/assignments/media-types/text/csv
|
|
||||||
String getContentType() const override
|
|
||||||
{
|
|
||||||
return String("text/csv; charset=UTF-8; header=") + (with_names ? "present" : "absent");
|
|
||||||
}
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void writeTotals();
|
|
||||||
void writeExtremes();
|
|
||||||
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
const Block sample;
|
|
||||||
bool with_names;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
DataTypes data_types;
|
|
||||||
Block totals;
|
|
||||||
Block extremes;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,332 +0,0 @@
|
|||||||
#include "CapnProtoRowInputStream.h"
|
|
||||||
|
|
||||||
#if USE_CAPNP
|
|
||||||
#include <IO/ReadBuffer.h>
|
|
||||||
#include <Interpreters/Context.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
#include <Formats/FormatSchemaInfo.h>
|
|
||||||
#include <capnp/serialize.h>
|
|
||||||
#include <capnp/dynamic.h>
|
|
||||||
#include <capnp/common.h>
|
|
||||||
#include <boost/algorithm/string.hpp>
|
|
||||||
#include <boost/range/join.hpp>
|
|
||||||
#include <common/logger_useful.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int BAD_TYPE_OF_FIELD;
|
|
||||||
extern const int BAD_ARGUMENTS;
|
|
||||||
extern const int THERE_IS_NO_COLUMN;
|
|
||||||
extern const int LOGICAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
CapnProtoRowInputStream::NestedField split(const Block & header, size_t i)
|
|
||||||
{
|
|
||||||
CapnProtoRowInputStream::NestedField field = {{}, i};
|
|
||||||
|
|
||||||
// Remove leading dot in field definition, e.g. ".msg" -> "msg"
|
|
||||||
String name(header.safeGetByPosition(i).name);
|
|
||||||
if (name.size() > 0 && name[0] == '.')
|
|
||||||
name.erase(0, 1);
|
|
||||||
|
|
||||||
boost::split(field.tokens, name, boost::is_any_of("._"));
|
|
||||||
return field;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
Field convertNodeToField(capnp::DynamicValue::Reader value)
|
|
||||||
{
|
|
||||||
switch (value.getType())
|
|
||||||
{
|
|
||||||
case capnp::DynamicValue::UNKNOWN:
|
|
||||||
throw Exception("Unknown field type", ErrorCodes::BAD_TYPE_OF_FIELD);
|
|
||||||
case capnp::DynamicValue::VOID:
|
|
||||||
return Field();
|
|
||||||
case capnp::DynamicValue::BOOL:
|
|
||||||
return value.as<bool>() ? 1u : 0u;
|
|
||||||
case capnp::DynamicValue::INT:
|
|
||||||
return value.as<int64_t>();
|
|
||||||
case capnp::DynamicValue::UINT:
|
|
||||||
return value.as<uint64_t>();
|
|
||||||
case capnp::DynamicValue::FLOAT:
|
|
||||||
return value.as<double>();
|
|
||||||
case capnp::DynamicValue::TEXT:
|
|
||||||
{
|
|
||||||
auto arr = value.as<capnp::Text>();
|
|
||||||
return String(arr.begin(), arr.size());
|
|
||||||
}
|
|
||||||
case capnp::DynamicValue::DATA:
|
|
||||||
{
|
|
||||||
auto arr = value.as<capnp::Data>().asChars();
|
|
||||||
return String(arr.begin(), arr.size());
|
|
||||||
}
|
|
||||||
case capnp::DynamicValue::LIST:
|
|
||||||
{
|
|
||||||
auto listValue = value.as<capnp::DynamicList>();
|
|
||||||
Array res(listValue.size());
|
|
||||||
for (auto i : kj::indices(listValue))
|
|
||||||
res[i] = convertNodeToField(listValue[i]);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
case capnp::DynamicValue::ENUM:
|
|
||||||
return value.as<capnp::DynamicEnum>().getRaw();
|
|
||||||
case capnp::DynamicValue::STRUCT:
|
|
||||||
{
|
|
||||||
auto structValue = value.as<capnp::DynamicStruct>();
|
|
||||||
const auto & fields = structValue.getSchema().getFields();
|
|
||||||
|
|
||||||
Field field = Tuple(TupleBackend(fields.size()));
|
|
||||||
TupleBackend & tuple = get<Tuple &>(field).toUnderType();
|
|
||||||
for (auto i : kj::indices(fields))
|
|
||||||
tuple[i] = convertNodeToField(structValue.get(fields[i]));
|
|
||||||
|
|
||||||
return field;
|
|
||||||
}
|
|
||||||
case capnp::DynamicValue::CAPABILITY:
|
|
||||||
throw Exception("CAPABILITY type not supported", ErrorCodes::BAD_TYPE_OF_FIELD);
|
|
||||||
case capnp::DynamicValue::ANY_POINTER:
|
|
||||||
throw Exception("ANY_POINTER type not supported", ErrorCodes::BAD_TYPE_OF_FIELD);
|
|
||||||
}
|
|
||||||
return Field();
|
|
||||||
}
|
|
||||||
|
|
||||||
capnp::StructSchema::Field getFieldOrThrow(capnp::StructSchema node, const std::string & field)
|
|
||||||
{
|
|
||||||
KJ_IF_MAYBE(child, node.findFieldByName(field))
|
|
||||||
return *child;
|
|
||||||
else
|
|
||||||
throw Exception("Field " + field + " doesn't exist in schema " + node.getShortDisplayName().cStr(), ErrorCodes::THERE_IS_NO_COLUMN);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void CapnProtoRowInputStream::createActions(const NestedFieldList & sorted_fields, capnp::StructSchema reader)
|
|
||||||
{
|
|
||||||
/// Columns in a table can map to fields in Cap'n'Proto or to structs.
|
|
||||||
|
|
||||||
/// Store common parents and their tokens in order to backtrack.
|
|
||||||
std::vector<capnp::StructSchema::Field> parents;
|
|
||||||
std::vector<std::string> parent_tokens;
|
|
||||||
|
|
||||||
capnp::StructSchema cur_reader = reader;
|
|
||||||
|
|
||||||
for (const auto & field : sorted_fields)
|
|
||||||
{
|
|
||||||
if (field.tokens.empty())
|
|
||||||
throw Exception("Logical error in CapnProtoRowInputStream", ErrorCodes::LOGICAL_ERROR);
|
|
||||||
|
|
||||||
// Backtrack to common parent
|
|
||||||
while (field.tokens.size() < parent_tokens.size() + 1
|
|
||||||
|| !std::equal(parent_tokens.begin(), parent_tokens.end(), field.tokens.begin()))
|
|
||||||
{
|
|
||||||
actions.push_back({Action::POP});
|
|
||||||
parents.pop_back();
|
|
||||||
parent_tokens.pop_back();
|
|
||||||
|
|
||||||
if (parents.empty())
|
|
||||||
{
|
|
||||||
cur_reader = reader;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
cur_reader = parents.back().getType().asStruct();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Go forward
|
|
||||||
while (parent_tokens.size() + 1 < field.tokens.size())
|
|
||||||
{
|
|
||||||
const auto & token = field.tokens[parents.size()];
|
|
||||||
auto node = getFieldOrThrow(cur_reader, token);
|
|
||||||
if (node.getType().isStruct())
|
|
||||||
{
|
|
||||||
// Descend to field structure
|
|
||||||
parents.emplace_back(node);
|
|
||||||
parent_tokens.emplace_back(token);
|
|
||||||
cur_reader = node.getType().asStruct();
|
|
||||||
actions.push_back({Action::PUSH, node});
|
|
||||||
}
|
|
||||||
else if (node.getType().isList())
|
|
||||||
{
|
|
||||||
break; // Collect list
|
|
||||||
}
|
|
||||||
else
|
|
||||||
throw Exception("Field " + token + " is neither Struct nor List", ErrorCodes::BAD_TYPE_OF_FIELD);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read field from the structure
|
|
||||||
auto node = getFieldOrThrow(cur_reader, field.tokens[parents.size()]);
|
|
||||||
if (node.getType().isList() && actions.size() > 0 && actions.back().field == node)
|
|
||||||
{
|
|
||||||
// The field list here flattens Nested elements into multiple arrays
|
|
||||||
// In order to map Nested types in Cap'nProto back, they need to be collected
|
|
||||||
// Since the field names are sorted, the order of field positions must be preserved
|
|
||||||
// For example, if the fields are { b @0 :Text, a @1 :Text }, the `a` would come first
|
|
||||||
// even though it's position is second.
|
|
||||||
auto & columns = actions.back().columns;
|
|
||||||
auto it = std::upper_bound(columns.cbegin(), columns.cend(), field.pos);
|
|
||||||
columns.insert(it, field.pos);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
actions.push_back({Action::READ, node, {field.pos}});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
CapnProtoRowInputStream::CapnProtoRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSchemaInfo& info)
|
|
||||||
: istr(istr_), header(header_), parser(std::make_shared<SchemaParser>())
|
|
||||||
{
|
|
||||||
// Parse the schema and fetch the root object
|
|
||||||
|
|
||||||
#pragma GCC diagnostic push
|
|
||||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
|
|
||||||
auto schema = parser->impl.parseDiskFile(info.schemaPath(), info.absoluteSchemaPath(), {});
|
|
||||||
#pragma GCC diagnostic pop
|
|
||||||
|
|
||||||
root = schema.getNested(info.messageName()).asStruct();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The schema typically consists of fields in various nested structures.
|
|
||||||
* Here we gather the list of fields and sort them in a way so that fields in the same structure are adjacent,
|
|
||||||
* and the nesting level doesn't decrease to make traversal easier.
|
|
||||||
*/
|
|
||||||
NestedFieldList list;
|
|
||||||
size_t num_columns = header.columns();
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
list.push_back(split(header, i));
|
|
||||||
|
|
||||||
// Order list first by value of strings then by length of string vector.
|
|
||||||
std::sort(list.begin(), list.end(), [](const NestedField & a, const NestedField & b) { return a.tokens < b.tokens; });
|
|
||||||
createActions(list, root);
|
|
||||||
}
|
|
||||||
|
|
||||||
kj::Array<capnp::word> CapnProtoRowInputStream::readMessage()
|
|
||||||
{
|
|
||||||
uint32_t segment_count;
|
|
||||||
istr.readStrict(reinterpret_cast<char*>(&segment_count), sizeof(uint32_t));
|
|
||||||
|
|
||||||
// one for segmentCount and one because segmentCount starts from 0
|
|
||||||
const auto prefix_size = (2 + segment_count) * sizeof(uint32_t);
|
|
||||||
const auto words_prefix_size = (segment_count + 1) / 2 + 1;
|
|
||||||
auto prefix = kj::heapArray<capnp::word>(words_prefix_size);
|
|
||||||
auto prefix_chars = prefix.asChars();
|
|
||||||
::memcpy(prefix_chars.begin(), &segment_count, sizeof(uint32_t));
|
|
||||||
|
|
||||||
// read size of each segment
|
|
||||||
for (size_t i = 0; i <= segment_count; ++i)
|
|
||||||
istr.readStrict(prefix_chars.begin() + ((i + 1) * sizeof(uint32_t)), sizeof(uint32_t));
|
|
||||||
|
|
||||||
// calculate size of message
|
|
||||||
const auto expected_words = capnp::expectedSizeInWordsFromPrefix(prefix);
|
|
||||||
const auto expected_bytes = expected_words * sizeof(capnp::word);
|
|
||||||
const auto data_size = expected_bytes - prefix_size;
|
|
||||||
auto msg = kj::heapArray<capnp::word>(expected_words);
|
|
||||||
auto msg_chars = msg.asChars();
|
|
||||||
|
|
||||||
// read full message
|
|
||||||
::memcpy(msg_chars.begin(), prefix_chars.begin(), prefix_size);
|
|
||||||
istr.readStrict(msg_chars.begin() + prefix_size, data_size);
|
|
||||||
|
|
||||||
return msg;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CapnProtoRowInputStream::read(MutableColumns & columns, RowReadExtension &)
|
|
||||||
{
|
|
||||||
if (istr.eof())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
auto array = readMessage();
|
|
||||||
|
|
||||||
#if CAPNP_VERSION >= 8000
|
|
||||||
capnp::UnalignedFlatArrayMessageReader msg(array);
|
|
||||||
#else
|
|
||||||
capnp::FlatArrayMessageReader msg(array);
|
|
||||||
#endif
|
|
||||||
std::vector<capnp::DynamicStruct::Reader> stack;
|
|
||||||
stack.push_back(msg.getRoot<capnp::DynamicStruct>(root));
|
|
||||||
|
|
||||||
for (auto action : actions)
|
|
||||||
{
|
|
||||||
switch (action.type)
|
|
||||||
{
|
|
||||||
case Action::READ:
|
|
||||||
{
|
|
||||||
Field value = convertNodeToField(stack.back().get(action.field));
|
|
||||||
if (action.columns.size() > 1)
|
|
||||||
{
|
|
||||||
// Nested columns must be flattened into several arrays
|
|
||||||
// e.g. Array(Tuple(x ..., y ...)) -> Array(x ...), Array(y ...)
|
|
||||||
const Array & collected = DB::get<const Array &>(value);
|
|
||||||
size_t size = collected.size();
|
|
||||||
// The flattened array contains an array of a part of the nested tuple
|
|
||||||
Array flattened(size);
|
|
||||||
for (size_t column_index = 0; column_index < action.columns.size(); ++column_index)
|
|
||||||
{
|
|
||||||
// Populate array with a single tuple elements
|
|
||||||
for (size_t off = 0; off < size; ++off)
|
|
||||||
{
|
|
||||||
const TupleBackend & tuple = DB::get<const Tuple &>(collected[off]).toUnderType();
|
|
||||||
flattened[off] = tuple[column_index];
|
|
||||||
}
|
|
||||||
auto & col = columns[action.columns[column_index]];
|
|
||||||
col->insert(flattened);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
auto & col = columns[action.columns[0]];
|
|
||||||
col->insert(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case Action::POP:
|
|
||||||
stack.pop_back();
|
|
||||||
break;
|
|
||||||
case Action::PUSH:
|
|
||||||
stack.push_back(stack.back().get(action.field).as<capnp::DynamicStruct>());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void registerInputFormatCapnProto(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerInputFormat(
|
|
||||||
"CapnProto",
|
|
||||||
[](ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context & context,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<CapnProtoRowInputStream>(buf, sample, FormatSchemaInfo(context, "CapnProto")),
|
|
||||||
sample,
|
|
||||||
max_block_size,
|
|
||||||
rows_portion_size,
|
|
||||||
callback,
|
|
||||||
settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class FormatFactory;
|
|
||||||
void registerInputFormatCapnProto(FormatFactory &) {}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // USE_CAPNP
|
|
@ -1,76 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
#include "config_formats.h"
|
|
||||||
#if USE_CAPNP
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
#include <capnp/schema-parser.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class FormatSchemaInfo;
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
/** A stream for reading messages in Cap'n Proto format in given schema.
|
|
||||||
* Like Protocol Buffers and Thrift (but unlike JSON or MessagePack),
|
|
||||||
* Cap'n Proto messages are strongly-typed and not self-describing.
|
|
||||||
* The schema in this case cannot be compiled in, so it uses a runtime schema parser.
|
|
||||||
* See https://capnproto.org/cxx.html
|
|
||||||
*/
|
|
||||||
class CapnProtoRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
struct NestedField
|
|
||||||
{
|
|
||||||
std::vector<std::string> tokens;
|
|
||||||
size_t pos;
|
|
||||||
};
|
|
||||||
using NestedFieldList = std::vector<NestedField>;
|
|
||||||
|
|
||||||
/** schema_dir - base path for schema files
|
|
||||||
* schema_file - location of the capnproto schema, e.g. "schema.capnp"
|
|
||||||
* root_object - name to the root object, e.g. "Message"
|
|
||||||
*/
|
|
||||||
CapnProtoRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSchemaInfo & info);
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension &) override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
kj::Array<capnp::word> readMessage();
|
|
||||||
|
|
||||||
// Build a traversal plan from a sorted list of fields
|
|
||||||
void createActions(const NestedFieldList & sortedFields, capnp::StructSchema reader);
|
|
||||||
|
|
||||||
/* Action for state machine for traversing nested structures. */
|
|
||||||
using BlockPositionList = std::vector<size_t>;
|
|
||||||
struct Action
|
|
||||||
{
|
|
||||||
enum Type { POP, PUSH, READ };
|
|
||||||
Type type{};
|
|
||||||
capnp::StructSchema::Field field{};
|
|
||||||
BlockPositionList columns{};
|
|
||||||
};
|
|
||||||
|
|
||||||
// Wrapper for classes that could throw in destructor
|
|
||||||
// https://github.com/capnproto/capnproto/issues/553
|
|
||||||
template <typename T>
|
|
||||||
struct DestructorCatcher
|
|
||||||
{
|
|
||||||
T impl;
|
|
||||||
template <typename ... Arg>
|
|
||||||
DestructorCatcher(Arg && ... args) : impl(kj::fwd<Arg>(args)...) {}
|
|
||||||
~DestructorCatcher() noexcept try { } catch (...) { return; }
|
|
||||||
};
|
|
||||||
using SchemaParser = DestructorCatcher<capnp::SchemaParser>;
|
|
||||||
|
|
||||||
ReadBuffer & istr;
|
|
||||||
Block header;
|
|
||||||
std::shared_ptr<SchemaParser> parser;
|
|
||||||
capnp::StructSchema root;
|
|
||||||
std::vector<Action> actions;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // USE_CAPNP
|
|
@ -5,6 +5,10 @@
|
|||||||
#include <Formats/FormatSettings.h>
|
#include <Formats/FormatSettings.h>
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
#include <Processors/Formats/IRowInputFormat.h>
|
#include <Processors/Formats/IRowInputFormat.h>
|
||||||
|
#include <Processors/Formats/InputStreamFromInputFormat.h>
|
||||||
|
#include <Processors/Formats/OutputStreamToOutputFormat.h>
|
||||||
|
#include <DataStreams/SquashingBlockOutputStream.h>
|
||||||
|
#include <DataStreams/NativeBlockInputStream.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -18,7 +22,6 @@ namespace ErrorCodes
|
|||||||
extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT;
|
extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
|
const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
|
||||||
{
|
{
|
||||||
auto it = dict.find(name);
|
auto it = dict.find(name);
|
||||||
@ -27,13 +30,6 @@ const FormatFactory::Creators & FormatFactory::getCreators(const String & name)
|
|||||||
throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT);
|
throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT);
|
||||||
}
|
}
|
||||||
|
|
||||||
const FormatFactory::ProcessorCreators & FormatFactory::getProcessorCreators(const String & name) const
|
|
||||||
{
|
|
||||||
auto it = processors_dict.find(name);
|
|
||||||
if (processors_dict.end() != it)
|
|
||||||
return it->second;
|
|
||||||
throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT);
|
|
||||||
}
|
|
||||||
|
|
||||||
static FormatSettings getInputFormatSetting(const Settings & settings)
|
static FormatSettings getInputFormatSetting(const Settings & settings)
|
||||||
{
|
{
|
||||||
@ -83,38 +79,54 @@ BlockInputStreamPtr FormatFactory::getInput(
|
|||||||
UInt64 rows_portion_size,
|
UInt64 rows_portion_size,
|
||||||
ReadCallback callback) const
|
ReadCallback callback) const
|
||||||
{
|
{
|
||||||
const auto & input_getter = getCreators(name).first;
|
if (name == "Native")
|
||||||
if (!input_getter)
|
return std::make_shared<NativeBlockInputStream>(buf, sample, 0);
|
||||||
throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
|
|
||||||
|
|
||||||
const Settings & settings = context.getSettingsRef();
|
if (!getCreators(name).input_processor_creator)
|
||||||
FormatSettings format_settings = getInputFormatSetting(settings);
|
return getInput(name, buf, sample, context, max_block_size, rows_portion_size, std::move(callback));
|
||||||
|
|
||||||
return input_getter(
|
auto format = getInputFormat(name, buf, sample, context, max_block_size, rows_portion_size, std::move(callback));
|
||||||
buf, sample, context, max_block_size, rows_portion_size, callback ? callback : ReadCallback(), format_settings);
|
return std::make_shared<InputStreamFromInputFormat>(std::move(format));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BlockOutputStreamPtr FormatFactory::getOutput(const String & name, WriteBuffer & buf, const Block & sample, const Context & context) const
|
BlockOutputStreamPtr FormatFactory::getOutput(const String & name, WriteBuffer & buf, const Block & sample, const Context & context) const
|
||||||
{
|
{
|
||||||
const auto & output_getter = getCreators(name).second;
|
if (name == "PrettyCompactMonoBlock")
|
||||||
if (!output_getter)
|
{
|
||||||
throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
|
/// TODO: rewrite
|
||||||
|
auto format = getOutputFormat("PrettyCompact", buf, sample, context);
|
||||||
|
auto res = std::make_shared<SquashingBlockOutputStream>(
|
||||||
|
std::make_shared<OutputStreamToOutputFormat>(format),
|
||||||
|
sample, context.getSettingsRef().output_format_pretty_max_rows, 0);
|
||||||
|
|
||||||
const Settings & settings = context.getSettingsRef();
|
res->disableFlush();
|
||||||
FormatSettings format_settings = getOutputFormatSetting(settings);
|
|
||||||
|
return std::make_shared<MaterializingBlockOutputStream>(res, sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!getCreators(name).output_processor_creator)
|
||||||
|
return getOutput(name, buf, sample, context);
|
||||||
|
|
||||||
|
auto format = getOutputFormat(name, buf, sample, context);
|
||||||
|
|
||||||
/** Materialization is needed, because formats can use the functions `IDataType`,
|
/** Materialization is needed, because formats can use the functions `IDataType`,
|
||||||
* which only work with full columns.
|
* which only work with full columns.
|
||||||
*/
|
*/
|
||||||
return std::make_shared<MaterializingBlockOutputStream>(
|
return std::make_shared<MaterializingBlockOutputStream>(std::make_shared<OutputStreamToOutputFormat>(format), sample);
|
||||||
output_getter(buf, sample, context, format_settings), sample);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
InputFormatPtr FormatFactory::getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, const Context & context, UInt64 max_block_size) const
|
InputFormatPtr FormatFactory::getInputFormat(
|
||||||
|
const String & name,
|
||||||
|
ReadBuffer & buf,
|
||||||
|
const Block & sample,
|
||||||
|
const Context & context,
|
||||||
|
UInt64 max_block_size,
|
||||||
|
UInt64 rows_portion_size,
|
||||||
|
ReadCallback callback) const
|
||||||
{
|
{
|
||||||
const auto & input_getter = getProcessorCreators(name).first;
|
const auto & input_getter = getCreators(name).input_processor_creator;
|
||||||
if (!input_getter)
|
if (!input_getter)
|
||||||
throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
|
throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
|
||||||
|
|
||||||
@ -125,6 +137,10 @@ InputFormatPtr FormatFactory::getInputFormat(const String & name, ReadBuffer & b
|
|||||||
params.max_block_size = max_block_size;
|
params.max_block_size = max_block_size;
|
||||||
params.allow_errors_num = format_settings.input_allow_errors_num;
|
params.allow_errors_num = format_settings.input_allow_errors_num;
|
||||||
params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
|
params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
|
||||||
|
params.rows_portion_size = rows_portion_size;
|
||||||
|
params.callback = std::move(callback);
|
||||||
|
params.max_execution_time = settings.max_execution_time;
|
||||||
|
params.timeout_overflow_mode = settings.timeout_overflow_mode;
|
||||||
|
|
||||||
return input_getter(buf, sample, context, params, format_settings);
|
return input_getter(buf, sample, context, params, format_settings);
|
||||||
}
|
}
|
||||||
@ -132,7 +148,7 @@ InputFormatPtr FormatFactory::getInputFormat(const String & name, ReadBuffer & b
|
|||||||
|
|
||||||
OutputFormatPtr FormatFactory::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample, const Context & context) const
|
OutputFormatPtr FormatFactory::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample, const Context & context) const
|
||||||
{
|
{
|
||||||
const auto & output_getter = getProcessorCreators(name).second;
|
const auto & output_getter = getCreators(name).output_processor_creator;
|
||||||
if (!output_getter)
|
if (!output_getter)
|
||||||
throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
|
throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
|
||||||
|
|
||||||
@ -148,7 +164,7 @@ OutputFormatPtr FormatFactory::getOutputFormat(const String & name, WriteBuffer
|
|||||||
|
|
||||||
void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
|
void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
|
||||||
{
|
{
|
||||||
auto & target = dict[name].first;
|
auto & target = dict[name].inout_creator;
|
||||||
if (target)
|
if (target)
|
||||||
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
||||||
target = std::move(input_creator);
|
target = std::move(input_creator);
|
||||||
@ -156,7 +172,7 @@ void FormatFactory::registerInputFormat(const String & name, InputCreator input_
|
|||||||
|
|
||||||
void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator)
|
void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator)
|
||||||
{
|
{
|
||||||
auto & target = dict[name].second;
|
auto & target = dict[name].output_creator;
|
||||||
if (target)
|
if (target)
|
||||||
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
||||||
target = std::move(output_creator);
|
target = std::move(output_creator);
|
||||||
@ -164,7 +180,7 @@ void FormatFactory::registerOutputFormat(const String & name, OutputCreator outp
|
|||||||
|
|
||||||
void FormatFactory::registerInputFormatProcessor(const String & name, InputProcessorCreator input_creator)
|
void FormatFactory::registerInputFormatProcessor(const String & name, InputProcessorCreator input_creator)
|
||||||
{
|
{
|
||||||
auto & target = processors_dict[name].first;
|
auto & target = dict[name].input_processor_creator;
|
||||||
if (target)
|
if (target)
|
||||||
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
||||||
target = std::move(input_creator);
|
target = std::move(input_creator);
|
||||||
@ -172,7 +188,7 @@ void FormatFactory::registerInputFormatProcessor(const String & name, InputProce
|
|||||||
|
|
||||||
void FormatFactory::registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator)
|
void FormatFactory::registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator)
|
||||||
{
|
{
|
||||||
auto & target = processors_dict[name].second;
|
auto & target = dict[name].output_processor_creator;
|
||||||
if (target)
|
if (target)
|
||||||
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
||||||
target = std::move(output_creator);
|
target = std::move(output_creator);
|
||||||
@ -183,22 +199,8 @@ void FormatFactory::registerOutputFormatProcessor(const String & name, OutputPro
|
|||||||
|
|
||||||
void registerInputFormatNative(FormatFactory & factory);
|
void registerInputFormatNative(FormatFactory & factory);
|
||||||
void registerOutputFormatNative(FormatFactory & factory);
|
void registerOutputFormatNative(FormatFactory & factory);
|
||||||
void registerInputFormatRowBinary(FormatFactory & factory);
|
|
||||||
void registerOutputFormatRowBinary(FormatFactory & factory);
|
|
||||||
void registerInputFormatTabSeparated(FormatFactory & factory);
|
void registerInputFormatTabSeparated(FormatFactory & factory);
|
||||||
void registerOutputFormatTabSeparated(FormatFactory & factory);
|
|
||||||
void registerInputFormatValues(FormatFactory & factory);
|
|
||||||
void registerOutputFormatValues(FormatFactory & factory);
|
|
||||||
void registerInputFormatCSV(FormatFactory & factory);
|
void registerInputFormatCSV(FormatFactory & factory);
|
||||||
void registerOutputFormatCSV(FormatFactory & factory);
|
|
||||||
void registerInputFormatTSKV(FormatFactory & factory);
|
|
||||||
void registerOutputFormatTSKV(FormatFactory & factory);
|
|
||||||
void registerInputFormatJSONEachRow(FormatFactory & factory);
|
|
||||||
void registerOutputFormatJSONEachRow(FormatFactory & factory);
|
|
||||||
void registerInputFormatParquet(FormatFactory & factory);
|
|
||||||
void registerOutputFormatParquet(FormatFactory & factory);
|
|
||||||
void registerInputFormatProtobuf(FormatFactory & factory);
|
|
||||||
void registerOutputFormatProtobuf(FormatFactory & factory);
|
|
||||||
|
|
||||||
void registerInputFormatProcessorNative(FormatFactory & factory);
|
void registerInputFormatProcessorNative(FormatFactory & factory);
|
||||||
void registerOutputFormatProcessorNative(FormatFactory & factory);
|
void registerOutputFormatProcessorNative(FormatFactory & factory);
|
||||||
@ -221,17 +223,7 @@ void registerOutputFormatProcessorProtobuf(FormatFactory & factory);
|
|||||||
|
|
||||||
/// Output only (presentational) formats.
|
/// Output only (presentational) formats.
|
||||||
|
|
||||||
void registerOutputFormatPretty(FormatFactory & factory);
|
|
||||||
void registerOutputFormatPrettyCompact(FormatFactory & factory);
|
|
||||||
void registerOutputFormatPrettySpace(FormatFactory & factory);
|
|
||||||
void registerOutputFormatVertical(FormatFactory & factory);
|
|
||||||
void registerOutputFormatJSON(FormatFactory & factory);
|
|
||||||
void registerOutputFormatJSONCompact(FormatFactory & factory);
|
|
||||||
void registerOutputFormatXML(FormatFactory & factory);
|
|
||||||
void registerOutputFormatODBCDriver(FormatFactory & factory);
|
|
||||||
void registerOutputFormatODBCDriver2(FormatFactory & factory);
|
|
||||||
void registerOutputFormatNull(FormatFactory & factory);
|
void registerOutputFormatNull(FormatFactory & factory);
|
||||||
void registerOutputFormatMySQLWire(FormatFactory & factory);
|
|
||||||
|
|
||||||
void registerOutputFormatProcessorPretty(FormatFactory & factory);
|
void registerOutputFormatProcessorPretty(FormatFactory & factory);
|
||||||
void registerOutputFormatProcessorPrettyCompact(FormatFactory & factory);
|
void registerOutputFormatProcessorPrettyCompact(FormatFactory & factory);
|
||||||
@ -246,34 +238,14 @@ void registerOutputFormatProcessorNull(FormatFactory & factory);
|
|||||||
void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory);
|
void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory);
|
||||||
|
|
||||||
/// Input only formats.
|
/// Input only formats.
|
||||||
|
|
||||||
void registerInputFormatCapnProto(FormatFactory & factory);
|
|
||||||
void registerInputFormatProcessorCapnProto(FormatFactory & factory);
|
void registerInputFormatProcessorCapnProto(FormatFactory & factory);
|
||||||
|
|
||||||
|
|
||||||
FormatFactory::FormatFactory()
|
FormatFactory::FormatFactory()
|
||||||
{
|
{
|
||||||
registerInputFormatNative(*this);
|
registerInputFormatNative(*this);
|
||||||
registerOutputFormatNative(*this);
|
registerOutputFormatNative(*this);
|
||||||
registerInputFormatRowBinary(*this);
|
|
||||||
registerOutputFormatRowBinary(*this);
|
|
||||||
registerInputFormatTabSeparated(*this);
|
registerInputFormatTabSeparated(*this);
|
||||||
registerOutputFormatTabSeparated(*this);
|
|
||||||
registerInputFormatValues(*this);
|
|
||||||
registerOutputFormatValues(*this);
|
|
||||||
registerInputFormatCSV(*this);
|
registerInputFormatCSV(*this);
|
||||||
registerOutputFormatCSV(*this);
|
|
||||||
registerInputFormatTSKV(*this);
|
|
||||||
registerOutputFormatTSKV(*this);
|
|
||||||
registerInputFormatJSONEachRow(*this);
|
|
||||||
registerOutputFormatJSONEachRow(*this);
|
|
||||||
registerInputFormatProtobuf(*this);
|
|
||||||
registerOutputFormatProtobuf(*this);
|
|
||||||
registerInputFormatCapnProto(*this);
|
|
||||||
registerInputFormatParquet(*this);
|
|
||||||
registerOutputFormatParquet(*this);
|
|
||||||
|
|
||||||
registerOutputFormatMySQLWire(*this);
|
|
||||||
|
|
||||||
registerInputFormatProcessorNative(*this);
|
registerInputFormatProcessorNative(*this);
|
||||||
registerOutputFormatProcessorNative(*this);
|
registerOutputFormatProcessorNative(*this);
|
||||||
@ -295,15 +267,7 @@ FormatFactory::FormatFactory()
|
|||||||
registerInputFormatProcessorParquet(*this);
|
registerInputFormatProcessorParquet(*this);
|
||||||
registerOutputFormatProcessorParquet(*this);
|
registerOutputFormatProcessorParquet(*this);
|
||||||
|
|
||||||
registerOutputFormatPretty(*this);
|
|
||||||
registerOutputFormatPrettyCompact(*this);
|
|
||||||
registerOutputFormatPrettySpace(*this);
|
|
||||||
registerOutputFormatVertical(*this);
|
|
||||||
registerOutputFormatJSON(*this);
|
|
||||||
registerOutputFormatJSONCompact(*this);
|
|
||||||
registerOutputFormatXML(*this);
|
|
||||||
registerOutputFormatODBCDriver(*this);
|
|
||||||
registerOutputFormatODBCDriver2(*this);
|
|
||||||
registerOutputFormatNull(*this);
|
registerOutputFormatNull(*this);
|
||||||
|
|
||||||
registerOutputFormatProcessorPretty(*this);
|
registerOutputFormatProcessorPretty(*this);
|
||||||
|
@ -70,11 +70,15 @@ private:
|
|||||||
const Context & context,
|
const Context & context,
|
||||||
const FormatSettings & settings)>;
|
const FormatSettings & settings)>;
|
||||||
|
|
||||||
using Creators = std::pair<InputCreator, OutputCreator>;
|
struct Creators
|
||||||
using ProcessorCreators = std::pair<InputProcessorCreator, OutputProcessorCreator>;
|
{
|
||||||
|
InputCreator inout_creator;
|
||||||
|
OutputCreator output_creator;
|
||||||
|
InputProcessorCreator input_processor_creator;
|
||||||
|
OutputProcessorCreator output_processor_creator;
|
||||||
|
};
|
||||||
|
|
||||||
using FormatsDictionary = std::unordered_map<String, Creators>;
|
using FormatsDictionary = std::unordered_map<String, Creators>;
|
||||||
using FormatProcessorsDictionary = std::unordered_map<String, ProcessorCreators>;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
BlockInputStreamPtr getInput(
|
BlockInputStreamPtr getInput(
|
||||||
@ -89,8 +93,14 @@ public:
|
|||||||
BlockOutputStreamPtr getOutput(const String & name, WriteBuffer & buf,
|
BlockOutputStreamPtr getOutput(const String & name, WriteBuffer & buf,
|
||||||
const Block & sample, const Context & context) const;
|
const Block & sample, const Context & context) const;
|
||||||
|
|
||||||
InputFormatPtr getInputFormat(const String & name, ReadBuffer & buf,
|
InputFormatPtr getInputFormat(
|
||||||
const Block & sample, const Context & context, UInt64 max_block_size) const;
|
const String & name,
|
||||||
|
ReadBuffer & buf,
|
||||||
|
const Block & sample,
|
||||||
|
const Context & context,
|
||||||
|
UInt64 max_block_size,
|
||||||
|
UInt64 rows_portion_size = 0,
|
||||||
|
ReadCallback callback = {}) const;
|
||||||
|
|
||||||
OutputFormatPtr getOutputFormat(const String & name, WriteBuffer & buf,
|
OutputFormatPtr getOutputFormat(const String & name, WriteBuffer & buf,
|
||||||
const Block & sample, const Context & context) const;
|
const Block & sample, const Context & context) const;
|
||||||
@ -108,14 +118,13 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/// FormatsDictionary dict;
|
||||||
FormatsDictionary dict;
|
FormatsDictionary dict;
|
||||||
FormatProcessorsDictionary processors_dict;
|
|
||||||
|
|
||||||
FormatFactory();
|
FormatFactory();
|
||||||
friend class ext::singleton<FormatFactory>;
|
friend class ext::singleton<FormatFactory>;
|
||||||
|
|
||||||
const Creators & getCreators(const String & name) const;
|
const Creators & getCreators(const String & name) const;
|
||||||
const ProcessorCreators & getProcessorCreators(const String & name) const;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,120 +0,0 @@
|
|||||||
#include <Formats/JSONCompactRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
JSONCompactRowOutputStream::JSONCompactRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & settings_)
|
|
||||||
: JSONRowOutputStream(ostr_, sample_, settings_)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONCompactRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
type.serializeAsTextJSON(column, row_num, *ostr, settings);
|
|
||||||
++field_number;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONCompactRowOutputStream::writeFieldDelimiter()
|
|
||||||
{
|
|
||||||
writeCString(", ", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONCompactRowOutputStream::writeRowStartDelimiter()
|
|
||||||
{
|
|
||||||
if (row_count > 0)
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeCString("\t\t[", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONCompactRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeChar(']', *ostr);
|
|
||||||
field_number = 0;
|
|
||||||
++row_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONCompactRowOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t\"totals\": [", *ostr);
|
|
||||||
|
|
||||||
size_t totals_columns = totals.columns();
|
|
||||||
for (size_t i = 0; i < totals_columns; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeChar(',', *ostr);
|
|
||||||
|
|
||||||
const ColumnWithTypeAndName & column = totals.safeGetByPosition(i);
|
|
||||||
column.type->serializeAsTextJSON(*column.column.get(), 0, *ostr, settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeChar(']', *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void writeExtremesElement(const char * title, const Block & extremes, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
writeCString("\t\t\"", ostr);
|
|
||||||
writeCString(title, ostr);
|
|
||||||
writeCString("\": [", ostr);
|
|
||||||
|
|
||||||
size_t extremes_columns = extremes.columns();
|
|
||||||
for (size_t i = 0; i < extremes_columns; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeChar(',', ostr);
|
|
||||||
|
|
||||||
const ColumnWithTypeAndName & column = extremes.safeGetByPosition(i);
|
|
||||||
column.type->serializeAsTextJSON(*column.column.get(), row_num, ostr, settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeChar(']', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONCompactRowOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t\"extremes\":\n", *ostr);
|
|
||||||
writeCString("\t{\n", *ostr);
|
|
||||||
|
|
||||||
writeExtremesElement("min", extremes, 0, *ostr, settings);
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeExtremesElement("max", extremes, 1, *ostr, settings);
|
|
||||||
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t}", *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatJSONCompact(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("JSONCompact", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<JSONCompactRowOutputStream>(buf, sample, format_settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,31 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteBufferValidUTF8.h>
|
|
||||||
#include <Formats/JSONRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
struct FormatSettings;
|
|
||||||
|
|
||||||
/** The stream for outputting data in the JSONCompact format.
|
|
||||||
*/
|
|
||||||
class JSONCompactRowOutputStream : public JSONRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
JSONCompactRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeFieldDelimiter() override;
|
|
||||||
void writeRowStartDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void writeTotals() override;
|
|
||||||
void writeExtremes() override;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,272 +0,0 @@
|
|||||||
#include <IO/ReadHelpers.h>
|
|
||||||
|
|
||||||
#include <Formats/JSONEachRowRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
#include <DataTypes/NestedUtils.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int INCORRECT_DATA;
|
|
||||||
extern const int CANNOT_READ_ALL_DATA;
|
|
||||||
extern const int LOGICAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace
|
|
||||||
{
|
|
||||||
|
|
||||||
enum
|
|
||||||
{
|
|
||||||
UNKNOWN_FIELD = size_t(-1),
|
|
||||||
NESTED_FIELD = size_t(-2)
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
JSONEachRowRowInputStream::JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings)
|
|
||||||
: istr(istr_), header(header_), format_settings(format_settings), name_map(header.columns())
|
|
||||||
{
|
|
||||||
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
|
||||||
skipBOMIfExists(istr);
|
|
||||||
|
|
||||||
size_t num_columns = header.columns();
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
{
|
|
||||||
const String & colname = columnName(i);
|
|
||||||
name_map[colname] = i; /// NOTE You could place names more cache-locally.
|
|
||||||
if (format_settings.import_nested_json)
|
|
||||||
{
|
|
||||||
const auto splitted = Nested::splitName(colname);
|
|
||||||
if (!splitted.second.empty())
|
|
||||||
{
|
|
||||||
const StringRef table_name(colname.data(), splitted.first.size());
|
|
||||||
name_map[table_name] = NESTED_FIELD;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
prev_positions.assign(num_columns, name_map.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
const String & JSONEachRowRowInputStream::columnName(size_t i) const
|
|
||||||
{
|
|
||||||
return header.getByPosition(i).name;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline size_t JSONEachRowRowInputStream::columnIndex(const StringRef & name, size_t key_index)
|
|
||||||
{
|
|
||||||
/// Optimization by caching the order of fields (which is almost always the same)
|
|
||||||
/// and a quick check to match the next expected field, instead of searching the hash table.
|
|
||||||
|
|
||||||
if (prev_positions.size() > key_index
|
|
||||||
&& prev_positions[key_index] != name_map.end()
|
|
||||||
&& name == prev_positions[key_index]->getFirst())
|
|
||||||
{
|
|
||||||
return prev_positions[key_index]->getSecond();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
const auto it = name_map.find(name);
|
|
||||||
|
|
||||||
if (name_map.end() != it)
|
|
||||||
{
|
|
||||||
if (key_index < prev_positions.size())
|
|
||||||
prev_positions[key_index] = it;
|
|
||||||
|
|
||||||
return it->getSecond();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
return UNKNOWN_FIELD;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Read the field name and convert it to column name
|
|
||||||
* (taking into account the current nested name prefix)
|
|
||||||
* Resulting StringRef is valid only before next read from buf.
|
|
||||||
*/
|
|
||||||
StringRef JSONEachRowRowInputStream::readColumnName(ReadBuffer & buf)
|
|
||||||
{
|
|
||||||
// This is just an optimization: try to avoid copying the name into current_column_name
|
|
||||||
|
|
||||||
if (nested_prefix_length == 0 && buf.position() + 1 < buf.buffer().end())
|
|
||||||
{
|
|
||||||
char * next_pos = find_first_symbols<'\\', '"'>(buf.position() + 1, buf.buffer().end());
|
|
||||||
|
|
||||||
if (next_pos != buf.buffer().end() && *next_pos != '\\')
|
|
||||||
{
|
|
||||||
/// The most likely option is that there is no escape sequence in the key name, and the entire name is placed in the buffer.
|
|
||||||
assertChar('"', buf);
|
|
||||||
StringRef res(buf.position(), next_pos - buf.position());
|
|
||||||
buf.position() = next_pos + 1;
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
current_column_name.resize(nested_prefix_length);
|
|
||||||
readJSONStringInto(current_column_name, buf);
|
|
||||||
return current_column_name;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static inline void skipColonDelimeter(ReadBuffer & istr)
|
|
||||||
{
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
assertChar(':', istr);
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONEachRowRowInputStream::skipUnknownField(const StringRef & name_ref)
|
|
||||||
{
|
|
||||||
if (!format_settings.skip_unknown_fields)
|
|
||||||
throw Exception("Unknown field found while parsing JSONEachRow format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
|
|
||||||
|
|
||||||
skipJSONField(istr, name_ref);
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONEachRowRowInputStream::readField(size_t index, MutableColumns & columns)
|
|
||||||
{
|
|
||||||
if (read_columns[index])
|
|
||||||
throw Exception("Duplicate field found while parsing JSONEachRow format: " + columnName(index), ErrorCodes::INCORRECT_DATA);
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
header.getByPosition(index).type->deserializeAsTextJSON(*columns[index], istr, format_settings);
|
|
||||||
}
|
|
||||||
catch (Exception & e)
|
|
||||||
{
|
|
||||||
e.addMessage("(while read the value of key " + columnName(index) + ")");
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
|
|
||||||
read_columns[index] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool JSONEachRowRowInputStream::advanceToNextKey(size_t key_index)
|
|
||||||
{
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
|
|
||||||
if (istr.eof())
|
|
||||||
throw Exception("Unexpected end of stream while parsing JSONEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA);
|
|
||||||
else if (*istr.position() == '}')
|
|
||||||
{
|
|
||||||
++istr.position();
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (key_index > 0)
|
|
||||||
{
|
|
||||||
assertChar(',', istr);
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONEachRowRowInputStream::readJSONObject(MutableColumns & columns)
|
|
||||||
{
|
|
||||||
assertChar('{', istr);
|
|
||||||
|
|
||||||
for (size_t key_index = 0; advanceToNextKey(key_index); ++key_index)
|
|
||||||
{
|
|
||||||
StringRef name_ref = readColumnName(istr);
|
|
||||||
const size_t column_index = columnIndex(name_ref, key_index);
|
|
||||||
|
|
||||||
if (unlikely(ssize_t(column_index) < 0))
|
|
||||||
{
|
|
||||||
/// name_ref may point directly to the input buffer
|
|
||||||
/// and input buffer may be filled with new data on next read
|
|
||||||
/// If we want to use name_ref after another reads from buffer, we must copy it to temporary string.
|
|
||||||
|
|
||||||
current_column_name.assign(name_ref.data, name_ref.size);
|
|
||||||
name_ref = StringRef(current_column_name);
|
|
||||||
|
|
||||||
skipColonDelimeter(istr);
|
|
||||||
|
|
||||||
if (column_index == UNKNOWN_FIELD)
|
|
||||||
skipUnknownField(name_ref);
|
|
||||||
else if (column_index == NESTED_FIELD)
|
|
||||||
readNestedData(name_ref.toString(), columns);
|
|
||||||
else
|
|
||||||
throw Exception("Logical error: illegal value of column_index", ErrorCodes::LOGICAL_ERROR);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
skipColonDelimeter(istr);
|
|
||||||
readField(column_index, columns);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONEachRowRowInputStream::readNestedData(const String & name, MutableColumns & columns)
|
|
||||||
{
|
|
||||||
current_column_name = name;
|
|
||||||
current_column_name.push_back('.');
|
|
||||||
nested_prefix_length = current_column_name.size();
|
|
||||||
readJSONObject(columns);
|
|
||||||
nested_prefix_length = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool JSONEachRowRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
|
|
||||||
{
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
|
|
||||||
/// We consume ;, or \n before scanning a new row, instead scanning to next row at the end.
|
|
||||||
/// The reason is that if we want an exact number of rows read with LIMIT x
|
|
||||||
/// from a streaming table engine with text data format, like File or Kafka
|
|
||||||
/// then seeking to next ;, or \n would trigger reading of an extra row at the end.
|
|
||||||
|
|
||||||
/// Semicolon is added for convenience as it could be used at end of INSERT query.
|
|
||||||
if (!istr.eof() && (*istr.position() == ',' || *istr.position() == ';'))
|
|
||||||
++istr.position();
|
|
||||||
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
if (istr.eof())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
size_t num_columns = columns.size();
|
|
||||||
|
|
||||||
/// Set of columns for which the values were read. The rest will be filled with default values.
|
|
||||||
read_columns.assign(num_columns, false);
|
|
||||||
|
|
||||||
nested_prefix_length = 0;
|
|
||||||
readJSONObject(columns);
|
|
||||||
|
|
||||||
/// Fill non-visited columns with the default values.
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
if (!read_columns[i])
|
|
||||||
header.getByPosition(i).type->insertDefaultInto(*columns[i]);
|
|
||||||
|
|
||||||
/// return info about defaults set
|
|
||||||
ext.read_columns = read_columns;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONEachRowRowInputStream::syncAfterError()
|
|
||||||
{
|
|
||||||
skipToUnescapedNextLineOrEOF(istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatJSONEachRow(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerInputFormat("JSONEachRow", [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<JSONEachRowRowInputStream>(buf, sample, settings),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,68 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Common/HashTable/HashMap.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A stream for reading data in JSON format, where each row is represented by a separate JSON object.
|
|
||||||
* Objects can be separated by line feed, other whitespace characters in any number and possibly a comma.
|
|
||||||
* Fields can be listed in any order (including, in different lines there may be different order),
|
|
||||||
* and some fields may be missing.
|
|
||||||
*/
|
|
||||||
class JSONEachRowRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
JSONEachRowRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension & ext) override;
|
|
||||||
bool allowSyncAfterError() const override { return true; }
|
|
||||||
void syncAfterError() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
const String & columnName(size_t i) const;
|
|
||||||
size_t columnIndex(const StringRef & name, size_t key_index);
|
|
||||||
bool advanceToNextKey(size_t key_index);
|
|
||||||
void skipUnknownField(const StringRef & name_ref);
|
|
||||||
StringRef readColumnName(ReadBuffer & buf);
|
|
||||||
void readField(size_t index, MutableColumns & columns);
|
|
||||||
void readJSONObject(MutableColumns & columns);
|
|
||||||
void readNestedData(const String & name, MutableColumns & columns);
|
|
||||||
|
|
||||||
private:
|
|
||||||
ReadBuffer & istr;
|
|
||||||
Block header;
|
|
||||||
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
|
|
||||||
/// Buffer for the read from the stream field name. Used when you have to copy it.
|
|
||||||
/// Also, if processing of Nested data is in progress, it holds the common prefix
|
|
||||||
/// of the nested column names (so that appending the field name to it produces
|
|
||||||
/// the full column name)
|
|
||||||
String current_column_name;
|
|
||||||
|
|
||||||
/// If processing Nested data, holds the length of the common prefix
|
|
||||||
/// of the names of related nested columns. For example, for a table
|
|
||||||
/// created as follows
|
|
||||||
/// CREATE TABLE t (n Nested (i Int32, s String))
|
|
||||||
/// the nested column names are 'n.i' and 'n.s' and the nested prefix is 'n.'
|
|
||||||
size_t nested_prefix_length = 0;
|
|
||||||
|
|
||||||
std::vector<UInt8> read_columns;
|
|
||||||
|
|
||||||
/// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map.
|
|
||||||
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
|
|
||||||
NameMap name_map;
|
|
||||||
|
|
||||||
/// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
|
|
||||||
std::vector<NameMap::iterator> prev_positions;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,67 +0,0 @@
|
|||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <IO/WriteBufferValidUTF8.h>
|
|
||||||
#include <Formats/JSONEachRowRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
JSONEachRowRowOutputStream::JSONEachRowRowOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings)
|
|
||||||
: ostr(ostr_), settings(settings)
|
|
||||||
{
|
|
||||||
size_t columns = sample.columns();
|
|
||||||
fields.resize(columns);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
WriteBufferFromString out(fields[i]);
|
|
||||||
writeJSONString(sample.getByPosition(i).name, out, settings);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONEachRowRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
writeString(fields[field_number], ostr);
|
|
||||||
writeChar(':', ostr);
|
|
||||||
type.serializeAsTextJSON(column, row_num, ostr, settings);
|
|
||||||
++field_number;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONEachRowRowOutputStream::writeFieldDelimiter()
|
|
||||||
{
|
|
||||||
writeChar(',', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONEachRowRowOutputStream::writeRowStartDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('{', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONEachRowRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeCString("}\n", ostr);
|
|
||||||
field_number = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatJSONEachRow(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("JSONEachRow", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<JSONEachRowRowOutputStream>(buf, sample, format_settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,39 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** The stream for outputting data in JSON format, by object per line.
|
|
||||||
* Does not validate UTF-8.
|
|
||||||
*/
|
|
||||||
class JSONEachRowRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
JSONEachRowRowOutputStream(WriteBuffer & ostr_, const Block & sample, const FormatSettings & settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeFieldDelimiter() override;
|
|
||||||
void writeRowStartDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
|
|
||||||
void flush() override
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
size_t field_number = 0;
|
|
||||||
Names fields;
|
|
||||||
|
|
||||||
FormatSettings settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,246 +0,0 @@
|
|||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <IO/WriteBufferValidUTF8.h>
|
|
||||||
#include <Formats/JSONRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
JSONRowOutputStream::JSONRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & settings)
|
|
||||||
: dst_ostr(ostr_), settings(settings)
|
|
||||||
{
|
|
||||||
NamesAndTypesList columns(sample_.getNamesAndTypesList());
|
|
||||||
fields.assign(columns.begin(), columns.end());
|
|
||||||
|
|
||||||
bool need_validate_utf8 = false;
|
|
||||||
for (size_t i = 0; i < sample_.columns(); ++i)
|
|
||||||
{
|
|
||||||
if (!sample_.getByPosition(i).type->textCanContainOnlyValidUTF8())
|
|
||||||
need_validate_utf8 = true;
|
|
||||||
|
|
||||||
WriteBufferFromOwnString out;
|
|
||||||
writeJSONString(fields[i].name, out, settings);
|
|
||||||
|
|
||||||
fields[i].name = out.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (need_validate_utf8)
|
|
||||||
{
|
|
||||||
validating_ostr = std::make_unique<WriteBufferValidUTF8>(dst_ostr);
|
|
||||||
ostr = validating_ostr.get();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
ostr = &dst_ostr;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
writeCString("{\n", *ostr);
|
|
||||||
writeCString("\t\"meta\":\n", *ostr);
|
|
||||||
writeCString("\t[\n", *ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < fields.size(); ++i)
|
|
||||||
{
|
|
||||||
writeCString("\t\t{\n", *ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t\t\"name\": ", *ostr);
|
|
||||||
writeString(fields[i].name, *ostr);
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeCString("\t\t\t\"type\": ", *ostr);
|
|
||||||
writeJSONString(fields[i].type->getName(), *ostr, settings);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t}", *ostr);
|
|
||||||
if (i + 1 < fields.size())
|
|
||||||
writeChar(',', *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeCString("\t],\n", *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t\"data\":\n", *ostr);
|
|
||||||
writeCString("\t[\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
writeCString("\t\t\t", *ostr);
|
|
||||||
writeString(fields[field_number].name, *ostr);
|
|
||||||
writeCString(": ", *ostr);
|
|
||||||
type.serializeAsTextJSON(column, row_num, *ostr, settings);
|
|
||||||
++field_number;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeFieldDelimiter()
|
|
||||||
{
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeRowStartDelimiter()
|
|
||||||
{
|
|
||||||
if (row_count > 0)
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeCString("\t\t{\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t\t}", *ostr);
|
|
||||||
field_number = 0;
|
|
||||||
++row_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t]", *ostr);
|
|
||||||
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
|
|
||||||
writeCString(",\n\n", *ostr);
|
|
||||||
writeCString("\t\"rows\": ", *ostr);
|
|
||||||
writeIntText(row_count, *ostr);
|
|
||||||
|
|
||||||
writeRowsBeforeLimitAtLeast();
|
|
||||||
|
|
||||||
if (settings.write_statistics)
|
|
||||||
writeStatistics();
|
|
||||||
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("}\n", *ostr);
|
|
||||||
ostr->next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeRowsBeforeLimitAtLeast()
|
|
||||||
{
|
|
||||||
if (applied_limit)
|
|
||||||
{
|
|
||||||
writeCString(",\n\n", *ostr);
|
|
||||||
writeCString("\t\"rows_before_limit_at_least\": ", *ostr);
|
|
||||||
writeIntText(rows_before_limit, *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t\"totals\":\n", *ostr);
|
|
||||||
writeCString("\t{\n", *ostr);
|
|
||||||
|
|
||||||
size_t totals_columns = totals.columns();
|
|
||||||
for (size_t i = 0; i < totals_columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & column = totals.safeGetByPosition(i);
|
|
||||||
|
|
||||||
if (i != 0)
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t", *ostr);
|
|
||||||
writeJSONString(column.name, *ostr, settings);
|
|
||||||
writeCString(": ", *ostr);
|
|
||||||
column.type->serializeAsTextJSON(*column.column.get(), 0, *ostr, settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t}", *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void writeExtremesElement(const char * title, const Block & extremes, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
writeCString("\t\t\"", ostr);
|
|
||||||
writeCString(title, ostr);
|
|
||||||
writeCString("\":\n", ostr);
|
|
||||||
writeCString("\t\t{\n", ostr);
|
|
||||||
|
|
||||||
size_t extremes_columns = extremes.columns();
|
|
||||||
for (size_t i = 0; i < extremes_columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & column = extremes.safeGetByPosition(i);
|
|
||||||
|
|
||||||
if (i != 0)
|
|
||||||
writeCString(",\n", ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t\t", ostr);
|
|
||||||
writeJSONString(column.name, ostr, settings);
|
|
||||||
writeCString(": ", ostr);
|
|
||||||
column.type->serializeAsTextJSON(*column.column.get(), row_num, ostr, settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
writeCString("\t\t}", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t\"extremes\":\n", *ostr);
|
|
||||||
writeCString("\t{\n", *ostr);
|
|
||||||
|
|
||||||
writeExtremesElement("min", extremes, 0, *ostr, settings);
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeExtremesElement("max", extremes, 1, *ostr, settings);
|
|
||||||
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
writeCString("\t}", *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::onProgress(const Progress & value)
|
|
||||||
{
|
|
||||||
progress.incrementPiecewiseAtomically(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void JSONRowOutputStream::writeStatistics()
|
|
||||||
{
|
|
||||||
writeCString(",\n\n", *ostr);
|
|
||||||
writeCString("\t\"statistics\":\n", *ostr);
|
|
||||||
writeCString("\t{\n", *ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t\"elapsed\": ", *ostr);
|
|
||||||
writeText(watch.elapsedSeconds(), *ostr);
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeCString("\t\t\"rows_read\": ", *ostr);
|
|
||||||
writeText(progress.read_rows.load(), *ostr);
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeCString("\t\t\"bytes_read\": ", *ostr);
|
|
||||||
writeText(progress.read_bytes.load(), *ostr);
|
|
||||||
writeChar('\n', *ostr);
|
|
||||||
|
|
||||||
writeCString("\t}", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatJSON(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("JSON", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<JSONRowOutputStream>(buf, sample, format_settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,74 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <IO/Progress.h>
|
|
||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <Common/Stopwatch.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** Stream for output data in JSON format.
|
|
||||||
*/
|
|
||||||
class JSONRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
JSONRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeFieldDelimiter() override;
|
|
||||||
void writeRowStartDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
void writePrefix() override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override
|
|
||||||
{
|
|
||||||
ostr->next();
|
|
||||||
|
|
||||||
if (validating_ostr)
|
|
||||||
dst_ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void setRowsBeforeLimit(size_t rows_before_limit_) override
|
|
||||||
{
|
|
||||||
applied_limit = true;
|
|
||||||
rows_before_limit = rows_before_limit_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
void setExtremes(const Block & extremes_) override { extremes = extremes_; }
|
|
||||||
|
|
||||||
void onProgress(const Progress & value) override;
|
|
||||||
|
|
||||||
String getContentType() const override { return "application/json; charset=UTF-8"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
|
|
||||||
void writeRowsBeforeLimitAtLeast();
|
|
||||||
virtual void writeTotals();
|
|
||||||
virtual void writeExtremes();
|
|
||||||
void writeStatistics();
|
|
||||||
|
|
||||||
WriteBuffer & dst_ostr;
|
|
||||||
std::unique_ptr<WriteBuffer> validating_ostr; /// Validates UTF-8 sequences, replaces bad sequences with replacement character.
|
|
||||||
WriteBuffer * ostr;
|
|
||||||
|
|
||||||
size_t field_number = 0;
|
|
||||||
size_t row_count = 0;
|
|
||||||
bool applied_limit = false;
|
|
||||||
size_t rows_before_limit = 0;
|
|
||||||
NamesAndTypes fields;
|
|
||||||
Block totals;
|
|
||||||
Block extremes;
|
|
||||||
|
|
||||||
Progress progress;
|
|
||||||
Stopwatch watch;
|
|
||||||
FormatSettings settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,85 +0,0 @@
|
|||||||
#include "MySQLWireBlockOutputStream.h"
|
|
||||||
#include <Core/MySQLProtocol.h>
|
|
||||||
#include <Interpreters/ProcessList.h>
|
|
||||||
#include <iomanip>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
using namespace MySQLProtocol;
|
|
||||||
|
|
||||||
MySQLWireBlockOutputStream::MySQLWireBlockOutputStream(WriteBuffer & buf, const Block & header, Context & context)
|
|
||||||
: header(header)
|
|
||||||
, context(context)
|
|
||||||
, packet_sender(buf, context.mysql.sequence_id)
|
|
||||||
{
|
|
||||||
packet_sender.max_packet_size = context.mysql.max_packet_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
void MySQLWireBlockOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
if (header.columns() == 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
packet_sender.sendPacket(LengthEncodedNumber(header.columns()));
|
|
||||||
|
|
||||||
for (const ColumnWithTypeAndName & column : header.getColumnsWithTypeAndName())
|
|
||||||
{
|
|
||||||
ColumnDefinition column_definition(column.name, CharacterSet::binary, 0, ColumnType::MYSQL_TYPE_STRING, 0, 0);
|
|
||||||
packet_sender.sendPacket(column_definition);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!(context.mysql.client_capabilities & Capability::CLIENT_DEPRECATE_EOF))
|
|
||||||
{
|
|
||||||
packet_sender.sendPacket(EOF_Packet(0, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void MySQLWireBlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
size_t rows = block.rows();
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows; i++)
|
|
||||||
{
|
|
||||||
ResultsetRow row_packet;
|
|
||||||
for (const ColumnWithTypeAndName & column : block)
|
|
||||||
{
|
|
||||||
WriteBufferFromOwnString ostr;
|
|
||||||
column.type->serializeAsText(*column.column.get(), i, ostr, format_settings);
|
|
||||||
row_packet.appendColumn(std::move(ostr.str()));
|
|
||||||
}
|
|
||||||
packet_sender.sendPacket(row_packet);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void MySQLWireBlockOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
size_t affected_rows = 0;
|
|
||||||
std::stringstream human_readable_info;
|
|
||||||
if (QueryStatus * process_list_elem = context.getProcessListElement())
|
|
||||||
{
|
|
||||||
CurrentThread::finalizePerformanceCounters();
|
|
||||||
QueryStatusInfo info = process_list_elem->getInfo();
|
|
||||||
affected_rows = info.written_rows;
|
|
||||||
human_readable_info << std::fixed << std::setprecision(3)
|
|
||||||
<< "Read " << info.read_rows << " rows, " << formatReadableSizeWithBinarySuffix(info.read_bytes) << " in " << info.elapsed_seconds << " sec., "
|
|
||||||
<< static_cast<size_t>(info.read_rows / info.elapsed_seconds) << " rows/sec., "
|
|
||||||
<< formatReadableSizeWithBinarySuffix(info.read_bytes / info.elapsed_seconds) << "/sec.";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (header.columns() == 0)
|
|
||||||
packet_sender.sendPacket(OK_Packet(0x0, context.mysql.client_capabilities, affected_rows, 0, 0, "", human_readable_info.str()), true);
|
|
||||||
else
|
|
||||||
if (context.mysql.client_capabilities & CLIENT_DEPRECATE_EOF)
|
|
||||||
packet_sender.sendPacket(OK_Packet(0xfe, context.mysql.client_capabilities, affected_rows, 0, 0, "", human_readable_info.str()), true);
|
|
||||||
else
|
|
||||||
packet_sender.sendPacket(EOF_Packet(0, 0), true);
|
|
||||||
}
|
|
||||||
|
|
||||||
void MySQLWireBlockOutputStream::flush()
|
|
||||||
{
|
|
||||||
packet_sender.out->next();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,36 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/MySQLProtocol.h>
|
|
||||||
#include <DataStreams/IBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Interpreters/Context.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** Interface for writing rows in MySQL Client/Server Protocol format.
|
|
||||||
*/
|
|
||||||
class MySQLWireBlockOutputStream : public IBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
MySQLWireBlockOutputStream(WriteBuffer & buf, const Block & header, Context & context);
|
|
||||||
|
|
||||||
Block getHeader() const { return header; }
|
|
||||||
|
|
||||||
void write(const Block & block);
|
|
||||||
|
|
||||||
void writePrefix();
|
|
||||||
void writeSuffix();
|
|
||||||
|
|
||||||
void flush();
|
|
||||||
private:
|
|
||||||
Block header;
|
|
||||||
Context & context;
|
|
||||||
MySQLProtocol::PacketSender packet_sender;
|
|
||||||
FormatSettings format_settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
using MySQLWireBlockOutputStreamPtr = std::shared_ptr<MySQLWireBlockOutputStream>;
|
|
||||||
|
|
||||||
}
|
|
@ -1,19 +0,0 @@
|
|||||||
#include <Formats/MySQLWireBlockOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
void registerOutputFormatMySQLWire(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("MySQLWire", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context & context,
|
|
||||||
const FormatSettings &)
|
|
||||||
{
|
|
||||||
return std::make_shared<MySQLWireBlockOutputStream>(buf, sample, const_cast<Context &>(context));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,103 +0,0 @@
|
|||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/ODBCDriver2BlockOutputStream.h>
|
|
||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <DataTypes/DataTypeLowCardinality.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
ODBCDriver2BlockOutputStream::ODBCDriver2BlockOutputStream(
|
|
||||||
WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings)
|
|
||||||
: out(out_), header(header_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriver2BlockOutputStream::flush()
|
|
||||||
{
|
|
||||||
out.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void writeODBCString(WriteBuffer & out, const std::string & str)
|
|
||||||
{
|
|
||||||
writeIntBinary(Int32(str.size()), out);
|
|
||||||
out.write(str.data(), str.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
static void writeRow(const Block & block, size_t row_idx, WriteBuffer & out, const FormatSettings & format_settings, std::string & buffer)
|
|
||||||
{
|
|
||||||
size_t columns = block.columns();
|
|
||||||
for (size_t column_idx = 0; column_idx < columns; ++column_idx)
|
|
||||||
{
|
|
||||||
buffer.clear();
|
|
||||||
const ColumnWithTypeAndName & col = block.getByPosition(column_idx);
|
|
||||||
|
|
||||||
if (col.column->isNullAt(row_idx))
|
|
||||||
{
|
|
||||||
writeIntBinary(Int32(-1), out);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
{
|
|
||||||
WriteBufferFromString text_out(buffer);
|
|
||||||
col.type->serializeAsText(*col.column, row_idx, text_out, format_settings);
|
|
||||||
}
|
|
||||||
writeODBCString(out, buffer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriver2BlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
String text_value;
|
|
||||||
const size_t rows = block.rows();
|
|
||||||
for (size_t i = 0; i < rows; ++i)
|
|
||||||
writeRow(block, i, out, format_settings, text_value);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriver2BlockOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
write(totals);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriver2BlockOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
const size_t columns = header.columns();
|
|
||||||
|
|
||||||
/// Number of header rows.
|
|
||||||
writeIntBinary(Int32(2), out);
|
|
||||||
|
|
||||||
/// Names of columns.
|
|
||||||
/// Number of columns + 1 for first name column.
|
|
||||||
writeIntBinary(Int32(columns + 1), out);
|
|
||||||
writeODBCString(out, "name");
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & col = header.getByPosition(i);
|
|
||||||
writeODBCString(out, col.name);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Types of columns.
|
|
||||||
writeIntBinary(Int32(columns + 1), out);
|
|
||||||
writeODBCString(out, "type");
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
auto type = header.getByPosition(i).type;
|
|
||||||
if (type->lowCardinality())
|
|
||||||
type = recursiveRemoveLowCardinality(type);
|
|
||||||
writeODBCString(out, type->getName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatODBCDriver2(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(
|
|
||||||
"ODBCDriver2", [](WriteBuffer & buf, const Block & sample, const Context &, const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<ODBCDriver2BlockOutputStream>(buf, sample, format_settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,51 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <DataStreams/IBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class WriteBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A data format designed to simplify the implementation of the ODBC driver.
|
|
||||||
* ODBC driver is designed to be build for different platforms without dependencies from the main code,
|
|
||||||
* so the format is made that way so that it can be as easy as possible to parse it.
|
|
||||||
* A header is displayed with the required information.
|
|
||||||
* The data is then output in the order of the rows. Each value is displayed as follows: length in Int32 format (-1 for NULL), then data in text form.
|
|
||||||
*/
|
|
||||||
class ODBCDriver2BlockOutputStream final : public IBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ODBCDriver2BlockOutputStream(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
Block getHeader() const override
|
|
||||||
{
|
|
||||||
return header;
|
|
||||||
}
|
|
||||||
void write(const Block & block) override;
|
|
||||||
void writePrefix() override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
std::string getContentType() const override
|
|
||||||
{
|
|
||||||
return "application/octet-stream";
|
|
||||||
}
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
WriteBuffer & out;
|
|
||||||
const Block header;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
Block totals;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@ -1,74 +0,0 @@
|
|||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/ODBCDriverBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
ODBCDriverBlockOutputStream::ODBCDriverBlockOutputStream(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings)
|
|
||||||
: out(out_), header(header_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriverBlockOutputStream::flush()
|
|
||||||
{
|
|
||||||
out.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriverBlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
const size_t rows = block.rows();
|
|
||||||
const size_t columns = block.columns();
|
|
||||||
String text_value;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows; ++i)
|
|
||||||
{
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
text_value.resize(0);
|
|
||||||
const ColumnWithTypeAndName & col = block.getByPosition(j);
|
|
||||||
|
|
||||||
{
|
|
||||||
WriteBufferFromString text_out(text_value);
|
|
||||||
col.type->serializeAsText(*col.column, i, text_out, format_settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeStringBinary(text_value, out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void ODBCDriverBlockOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
const size_t columns = header.columns();
|
|
||||||
|
|
||||||
/// Number of columns.
|
|
||||||
writeVarUInt(columns, out);
|
|
||||||
|
|
||||||
/// Names and types of columns.
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & col = header.getByPosition(i);
|
|
||||||
|
|
||||||
writeStringBinary(col.name, out);
|
|
||||||
writeStringBinary(col.type->getName(), out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatODBCDriver(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("ODBCDriver", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<ODBCDriverBlockOutputStream>(buf, sample, format_settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,39 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <DataStreams/IBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Core/Block.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class WriteBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A data format designed to simplify the implementation of the ODBC driver.
|
|
||||||
* ODBC driver is designed to be build for different platforms without dependencies from the main code,
|
|
||||||
* so the format is made that way so that it can be as easy as possible to parse it.
|
|
||||||
* A header is displayed with the required information.
|
|
||||||
* The data is then output in the order of the rows. Each value is displayed as follows: length in VarUInt format, then data in text form.
|
|
||||||
*/
|
|
||||||
class ODBCDriverBlockOutputStream : public IBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ODBCDriverBlockOutputStream(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
Block getHeader() const override { return header; }
|
|
||||||
void write(const Block & block) override;
|
|
||||||
void writePrefix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
std::string getContentType() const override { return "application/octet-stream"; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
WriteBuffer & out;
|
|
||||||
const Block header;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,494 +0,0 @@
|
|||||||
#include "ParquetBlockInputStream.h"
|
|
||||||
|
|
||||||
#if USE_PARQUET
|
|
||||||
# include <algorithm>
|
|
||||||
# include <iterator>
|
|
||||||
# include <vector>
|
|
||||||
// TODO: clear includes
|
|
||||||
# include <Columns/ColumnNullable.h>
|
|
||||||
# include <Columns/ColumnString.h>
|
|
||||||
# include <Columns/ColumnsNumber.h>
|
|
||||||
# include <Columns/IColumn.h>
|
|
||||||
# include <Core/ColumnWithTypeAndName.h>
|
|
||||||
# include <DataTypes/DataTypeDate.h>
|
|
||||||
# include <DataTypes/DataTypeDateTime.h>
|
|
||||||
# include <DataTypes/DataTypeFactory.h>
|
|
||||||
# include <DataTypes/DataTypeNullable.h>
|
|
||||||
# include <DataTypes/DataTypeString.h>
|
|
||||||
# include <DataTypes/DataTypesDecimal.h>
|
|
||||||
# include <DataTypes/DataTypesNumber.h>
|
|
||||||
# include <Formats/FormatFactory.h>
|
|
||||||
# include <IO/BufferBase.h>
|
|
||||||
# include <IO/ReadBufferFromMemory.h>
|
|
||||||
# include <IO/WriteBufferFromString.h>
|
|
||||||
# include <IO/WriteHelpers.h>
|
|
||||||
# include <IO/copyData.h>
|
|
||||||
# include <Interpreters/castColumn.h>
|
|
||||||
# include <arrow/api.h>
|
|
||||||
# include <parquet/arrow/reader.h>
|
|
||||||
# include <parquet/file_reader.h>
|
|
||||||
# include <common/DateLUTImpl.h>
|
|
||||||
# include <ext/range.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int UNKNOWN_TYPE;
|
|
||||||
extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE;
|
|
||||||
extern const int CANNOT_READ_ALL_DATA;
|
|
||||||
extern const int EMPTY_DATA_PASSED;
|
|
||||||
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
|
|
||||||
extern const int CANNOT_CONVERT_TYPE;
|
|
||||||
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN;
|
|
||||||
extern const int THERE_IS_NO_COLUMN;
|
|
||||||
}
|
|
||||||
|
|
||||||
ParquetBlockInputStream::ParquetBlockInputStream(ReadBuffer & istr_, const Block & header_, const Context & context_)
|
|
||||||
: istr{istr_}, header{header_}, context{context_}
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
Block ParquetBlockInputStream::getHeader() const
|
|
||||||
{
|
|
||||||
return header;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Inserts numeric data right into internal column data to reduce an overhead
|
|
||||||
template <typename NumericType, typename VectorType = ColumnVector<NumericType>>
|
|
||||||
void fillColumnWithNumericData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
|
|
||||||
{
|
|
||||||
auto & column_data = static_cast<VectorType &>(*internal_column).getData();
|
|
||||||
column_data.reserve(arrow_column->length());
|
|
||||||
|
|
||||||
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
|
|
||||||
{
|
|
||||||
std::shared_ptr<arrow::Array> chunk = arrow_column->data()->chunk(chunk_i);
|
|
||||||
/// buffers[0] is a null bitmap and buffers[1] are actual values
|
|
||||||
std::shared_ptr<arrow::Buffer> buffer = chunk->data()->buffers[1];
|
|
||||||
|
|
||||||
const auto * raw_data = reinterpret_cast<const NumericType *>(buffer->data());
|
|
||||||
column_data.insert_assume_reserved(raw_data, raw_data + chunk->length());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Inserts chars and offsets right into internal column data to reduce an overhead.
|
|
||||||
/// Internal offsets are shifted by one to the right in comparison with Arrow ones. So the last offset should map to the end of all chars.
|
|
||||||
/// Also internal strings are null terminated.
|
|
||||||
void fillColumnWithStringData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
|
|
||||||
{
|
|
||||||
PaddedPODArray<UInt8> & column_chars_t = static_cast<ColumnString &>(*internal_column).getChars();
|
|
||||||
PaddedPODArray<UInt64> & column_offsets = static_cast<ColumnString &>(*internal_column).getOffsets();
|
|
||||||
|
|
||||||
size_t chars_t_size = 0;
|
|
||||||
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
|
|
||||||
{
|
|
||||||
arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(arrow_column->data()->chunk(chunk_i)));
|
|
||||||
const size_t chunk_length = chunk.length();
|
|
||||||
|
|
||||||
chars_t_size += chunk.value_offset(chunk_length - 1) + chunk.value_length(chunk_length - 1);
|
|
||||||
chars_t_size += chunk_length; /// additional space for null bytes
|
|
||||||
}
|
|
||||||
|
|
||||||
column_chars_t.reserve(chars_t_size);
|
|
||||||
column_offsets.reserve(arrow_column->length());
|
|
||||||
|
|
||||||
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
|
|
||||||
{
|
|
||||||
arrow::BinaryArray & chunk = static_cast<arrow::BinaryArray &>(*(arrow_column->data()->chunk(chunk_i)));
|
|
||||||
std::shared_ptr<arrow::Buffer> buffer = chunk.value_data();
|
|
||||||
const size_t chunk_length = chunk.length();
|
|
||||||
|
|
||||||
for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
|
|
||||||
{
|
|
||||||
if (!chunk.IsNull(offset_i) && buffer)
|
|
||||||
{
|
|
||||||
const UInt8 * raw_data = buffer->data() + chunk.value_offset(offset_i);
|
|
||||||
column_chars_t.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i));
|
|
||||||
}
|
|
||||||
column_chars_t.emplace_back('\0');
|
|
||||||
|
|
||||||
column_offsets.emplace_back(column_chars_t.size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void fillColumnWithBooleanData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
|
|
||||||
{
|
|
||||||
auto & column_data = static_cast<ColumnVector<UInt8> &>(*internal_column).getData();
|
|
||||||
column_data.resize(arrow_column->length());
|
|
||||||
|
|
||||||
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
|
|
||||||
{
|
|
||||||
arrow::BooleanArray & chunk = static_cast<arrow::BooleanArray &>(*(arrow_column->data()->chunk(chunk_i)));
|
|
||||||
/// buffers[0] is a null bitmap and buffers[1] are actual values
|
|
||||||
std::shared_ptr<arrow::Buffer> buffer = chunk.data()->buffers[1];
|
|
||||||
|
|
||||||
for (size_t bool_i = 0; bool_i != static_cast<size_t>(chunk.length()); ++bool_i)
|
|
||||||
column_data[bool_i] = chunk.Value(bool_i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Arrow stores Parquet::DATE in Int32, while ClickHouse stores Date in UInt16. Therefore, it should be checked before saving
void fillColumnWithDate32Data(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    PaddedPODArray<UInt16> & column_data = static_cast<ColumnVector<UInt16> &>(*internal_column).getData();
    column_data.reserve(arrow_column->length());

    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::Date32Array & chunk = static_cast<arrow::Date32Array &>(*(arrow_column->data()->chunk(chunk_i)));

        for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
        {
            UInt32 days_num = static_cast<UInt32>(chunk.Value(value_i));
            /// Reject day numbers that do not fit into ClickHouse's Date (UInt16 days since epoch).
            if (days_num > DATE_LUT_MAX_DAY_NUM)
            {
                // TODO: will it rollback correctly?
                throw Exception{"Input value " + std::to_string(days_num) + " of a column \"" + arrow_column->name()
                        + "\" is greater than "
                          "max allowed Date value, which is "
                        + std::to_string(DATE_LUT_MAX_DAY_NUM),
                    ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};
            }

            column_data.emplace_back(days_num);
        }
    }
}
/// Arrow stores Parquet::DATETIME in Int64, while ClickHouse stores DateTime in UInt32. Therefore, it should be checked before saving
|
|
||||||
void fillColumnWithDate64Data(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
|
|
||||||
{
|
|
||||||
auto & column_data = static_cast<ColumnVector<UInt32> &>(*internal_column).getData();
|
|
||||||
column_data.reserve(arrow_column->length());
|
|
||||||
|
|
||||||
for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
|
|
||||||
{
|
|
||||||
auto & chunk = static_cast<arrow::Date64Array &>(*(arrow_column->data()->chunk(chunk_i)));
|
|
||||||
for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
|
|
||||||
{
|
|
||||||
auto timestamp = static_cast<UInt32>(chunk.Value(value_i) / 1000); // Always? in ms
|
|
||||||
column_data.emplace_back(timestamp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Copies arrow TIMESTAMP values into a ClickHouse DateTime (UInt32 seconds) column,
/// normalising whatever time unit each chunk declares down to whole seconds.
void fillColumnWithTimestampData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & column_data = static_cast<ColumnVector<UInt32> &>(*internal_column).getData();
    column_data.reserve(arrow_column->length());

    const size_t num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks());
    for (size_t chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx)
    {
        auto & chunk = static_cast<arrow::TimestampArray &>(*(arrow_column->data()->chunk(chunk_idx)));
        const auto & type = static_cast<const ::arrow::TimestampType &>(*chunk.type());

        /// Scale factor converting the chunk's declared time unit to seconds.
        UInt32 divisor = 1;
        switch (type.unit())
        {
            case arrow::TimeUnit::SECOND:
                divisor = 1;
                break;
            case arrow::TimeUnit::MILLI:
                divisor = 1000;
                break;
            case arrow::TimeUnit::MICRO:
                divisor = 1000000;
                break;
            case arrow::TimeUnit::NANO:
                divisor = 1000000000;
                break;
        }

        const size_t chunk_length = static_cast<size_t>(chunk.length());
        for (size_t row = 0; row < chunk_length; ++row)
            column_data.emplace_back(static_cast<UInt32>(chunk.Value(row) / divisor));
    }
}
/// Copies 128-bit decimal values from an arrow DecimalArray into a ClickHouse Decimal128 column.
void fillColumnWithDecimalData(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & internal_column)
{
    auto & column = static_cast<ColumnDecimal<Decimal128> &>(*internal_column);
    auto & column_data = column.getData();
    column_data.reserve(arrow_column->length());

    for (size_t chunk_i = 0, num_chunks = static_cast<size_t>(arrow_column->data()->num_chunks()); chunk_i < num_chunks; ++chunk_i)
    {
        auto & chunk = static_cast<arrow::DecimalArray &>(*(arrow_column->data()->chunk(chunk_i)));
        for (size_t value_i = 0, length = static_cast<size_t>(chunk.length()); value_i < length; ++value_i)
        {
            /// chunk.Value() returns a pointer to the raw 16-byte value; NULL slots may hold garbage,
            /// so they are replaced with zero here (the actual NULL mask is produced by fillByteMapFromArrowColumn).
            column_data.emplace_back(
                chunk.IsNull(value_i) ? Decimal128(0) : *reinterpret_cast<const Decimal128 *>(chunk.Value(value_i))); // TODO: copy column
        }
    }
}
/// Creates a null bytemap from arrow's null bitmap
|
|
||||||
void fillByteMapFromArrowColumn(std::shared_ptr<arrow::Column> & arrow_column, MutableColumnPtr & bytemap)
|
|
||||||
{
|
|
||||||
PaddedPODArray<UInt8> & bytemap_data = static_cast<ColumnVector<UInt8> &>(*bytemap).getData();
|
|
||||||
bytemap_data.reserve(arrow_column->length());
|
|
||||||
|
|
||||||
for (size_t chunk_i = 0; chunk_i != static_cast<size_t>(arrow_column->data()->num_chunks()); ++chunk_i)
|
|
||||||
{
|
|
||||||
std::shared_ptr<arrow::Array> chunk = arrow_column->data()->chunk(chunk_i);
|
|
||||||
|
|
||||||
for (size_t value_i = 0; value_i != static_cast<size_t>(chunk->length()); ++value_i)
|
|
||||||
bytemap_data.emplace_back(chunk->IsNull(value_i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Numeric arrow types that map 1:1 onto ClickHouse column types;
/// used to instantiate fillColumnWithNumericData in readImpl's dispatch switch.
# define FOR_ARROW_NUMERIC_TYPES(M) \
        M(arrow::Type::UINT8, UInt8) \
        M(arrow::Type::INT8, Int8) \
        M(arrow::Type::UINT16, UInt16) \
        M(arrow::Type::INT16, Int16) \
        M(arrow::Type::UINT32, UInt32) \
        M(arrow::Type::INT32, Int32) \
        M(arrow::Type::UINT64, UInt64) \
        M(arrow::Type::INT64, Int64) \
        M(arrow::Type::FLOAT, Float32) \
        M(arrow::Type::DOUBLE, Float64)
//M(arrow::Type::HALF_FLOAT, Float32) // TODO

/// Maps a column name from the parquet file to the corresponding arrow column.
using NameToColumnPtr = std::unordered_map<std::string, std::shared_ptr<arrow::Column>>;
/// Produces the next Block: on fresh input, buffers the whole stream in memory and opens a
/// parquet reader over it; then each call decodes one row group and converts it column by
/// column into the types expected by `header`. Returns an empty Block when all row groups
/// of the current file have been consumed.
Block ParquetBlockInputStream::readImpl()
{
    /// Static mapping from arrow type ids to the ClickHouse types used for decoding.
    static const std::unordered_map<arrow::Type::type, std::shared_ptr<IDataType>> arrow_type_to_internal_type = {
        //{arrow::Type::DECIMAL, std::make_shared<DataTypeDecimal>()},
        {arrow::Type::UINT8, std::make_shared<DataTypeUInt8>()},
        {arrow::Type::INT8, std::make_shared<DataTypeInt8>()},
        {arrow::Type::UINT16, std::make_shared<DataTypeUInt16>()},
        {arrow::Type::INT16, std::make_shared<DataTypeInt16>()},
        {arrow::Type::UINT32, std::make_shared<DataTypeUInt32>()},
        {arrow::Type::INT32, std::make_shared<DataTypeInt32>()},
        {arrow::Type::UINT64, std::make_shared<DataTypeUInt64>()},
        {arrow::Type::INT64, std::make_shared<DataTypeInt64>()},
        {arrow::Type::HALF_FLOAT, std::make_shared<DataTypeFloat32>()},
        {arrow::Type::FLOAT, std::make_shared<DataTypeFloat32>()},
        {arrow::Type::DOUBLE, std::make_shared<DataTypeFloat64>()},

        {arrow::Type::BOOL, std::make_shared<DataTypeUInt8>()},
        //{arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
        {arrow::Type::DATE32, std::make_shared<DataTypeDate>()},
        //{arrow::Type::DATE32, std::make_shared<DataTypeDateTime>()},
        {arrow::Type::DATE64, std::make_shared<DataTypeDateTime>()},
        {arrow::Type::TIMESTAMP, std::make_shared<DataTypeDateTime>()},
        //{arrow::Type::TIME32, std::make_shared<DataTypeDateTime>()},


        {arrow::Type::STRING, std::make_shared<DataTypeString>()},
        {arrow::Type::BINARY, std::make_shared<DataTypeString>()},
        //{arrow::Type::FIXED_SIZE_BINARY, std::make_shared<DataTypeString>()},
        //{arrow::Type::UUID, std::make_shared<DataTypeString>()},


        // TODO: add other types that are convertable to internal ones:
        // 0. ENUM?
        // 1. UUID -> String
        // 2. JSON -> String
        // Full list of types: contrib/arrow/cpp/src/arrow/type.h
    };


    Block res;

    if (!istr.eof())
    {
        /*
           First we load whole stream into string (its very bad and limiting .parquet file size to half? of RAM)
           Then producing blocks for every row_group (dont load big .parquet files with one row_group - it can eat x10+ RAM from .parquet file size)
        */

        /// New data must not arrive while unread row groups from the previous file remain.
        if (row_group_current < row_group_total)
            throw Exception{"Got new data, but data from previous chunks not readed " + std::to_string(row_group_current) + "/"
                    + std::to_string(row_group_total),
                ErrorCodes::CANNOT_READ_ALL_DATA};

        /// Buffer the entire input into file_data; the scope ensures the WriteBuffer is finalized.
        file_data.clear();
        {
            WriteBufferFromString file_buffer(file_data);
            copyData(istr, file_buffer);
        }

        /// arrow::Buffer is a non-owning view over file_data, so file_data must outlive it (both are members).
        buffer = std::make_unique<arrow::Buffer>(file_data);
        // TODO: maybe use parquet::RandomAccessSource?
        auto reader = parquet::ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(*buffer));
        file_reader = std::make_unique<parquet::arrow::FileReader>(::arrow::default_memory_pool(), std::move(reader));
        row_group_total = file_reader->num_row_groups();
        row_group_current = 0;
    }
    /// All row groups consumed: signal end-of-stream with an empty block.
    if (row_group_current >= row_group_total)
        return res;

    // TODO: also catch a ParquetException thrown by filereader?
    //arrow::Status read_status = filereader.ReadTable(&table);
    std::shared_ptr<arrow::Table> table;
    arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, &table);

    if (!read_status.ok())
        throw Exception{"Error while reading parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA};

    if (0 == table->num_rows())
        throw Exception{"Empty table in input data", ErrorCodes::EMPTY_DATA_PASSED};

    if (header.columns() > static_cast<size_t>(table->num_columns()))
        // TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable?
        throw Exception{"Number of columns is less than the table has", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH};

    ++row_group_current;

    /// Index the row group's columns by name for lookup against the header.
    NameToColumnPtr name_to_column_ptr;
    for (size_t i = 0, num_columns = static_cast<size_t>(table->num_columns()); i < num_columns; ++i)
    {
        std::shared_ptr<arrow::Column> arrow_column = table->column(i);
        name_to_column_ptr[arrow_column->name()] = arrow_column;
    }

    /// Convert each header column from its arrow representation to the expected ClickHouse type.
    for (size_t column_i = 0, columns = header.columns(); column_i < columns; ++column_i)
    {
        ColumnWithTypeAndName header_column = header.getByPosition(column_i);

        if (name_to_column_ptr.find(header_column.name) == name_to_column_ptr.end())
            // TODO: What if some columns were not presented? Insert NULLs? What if a column is not nullable?
            throw Exception{"Column \"" + header_column.name + "\" is not presented in input data", ErrorCodes::THERE_IS_NO_COLUMN};

        std::shared_ptr<arrow::Column> arrow_column = name_to_column_ptr[header_column.name];
        arrow::Type::type arrow_type = arrow_column->type()->id();

        // TODO: check if a column is const?
        if (!header_column.type->isNullable() && arrow_column->null_count())
        {
            throw Exception{"Can not insert NULL data into non-nullable column \"" + header_column.name + "\"",
                ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN};
        }

        /// Read into a nullable column if either side requires it.
        const bool target_column_is_nullable = header_column.type->isNullable() || arrow_column->null_count();

        DataTypePtr internal_nested_type;

        /// Decimal precision/scale come from the file, so the internal type is built dynamically.
        if (arrow_type == arrow::Type::DECIMAL)
        {
            const auto decimal_type = static_cast<arrow::DecimalType *>(arrow_column->type().get());
            internal_nested_type = std::make_shared<DataTypeDecimal<Decimal128>>(decimal_type->precision(), decimal_type->scale());
        }
        else if (arrow_type_to_internal_type.find(arrow_type) != arrow_type_to_internal_type.end())
        {
            internal_nested_type = arrow_type_to_internal_type.at(arrow_type);
        }
        else
        {
            throw Exception{"The type \"" + arrow_column->type()->name() + "\" of an input column \"" + arrow_column->name()
                    + "\" is not supported for conversion from a Parquet data format",
                ErrorCodes::CANNOT_CONVERT_TYPE};
        }

        const DataTypePtr internal_type = target_column_is_nullable ? makeNullable(internal_nested_type) : internal_nested_type;
        const std::string internal_nested_type_name = internal_nested_type->getName();

        const DataTypePtr column_nested_type = header_column.type->isNullable()
            ? static_cast<const DataTypeNullable *>(header_column.type.get())->getNestedType()
            : header_column.type;

        const DataTypePtr column_type = header_column.type;

        const std::string column_nested_type_name = column_nested_type->getName();

        ColumnWithTypeAndName column;
        column.name = header_column.name;
        column.type = internal_type;

        /// Data
        MutableColumnPtr read_column = internal_nested_type->createColumn();
        switch (arrow_type)
        {
            case arrow::Type::STRING:
            case arrow::Type::BINARY:
                //case arrow::Type::FIXED_SIZE_BINARY:
                fillColumnWithStringData(arrow_column, read_column);
                break;
            case arrow::Type::BOOL:
                fillColumnWithBooleanData(arrow_column, read_column);
                break;
            case arrow::Type::DATE32:
                fillColumnWithDate32Data(arrow_column, read_column);
                break;
            case arrow::Type::DATE64:
                fillColumnWithDate64Data(arrow_column, read_column);
                break;
            case arrow::Type::TIMESTAMP:
                fillColumnWithTimestampData(arrow_column, read_column);
                break;
            case arrow::Type::DECIMAL:
                //fillColumnWithNumericData<Decimal128, ColumnDecimal<Decimal128>>(arrow_column, read_column); // Have problems with trash values under NULL, but faster
                fillColumnWithDecimalData(arrow_column, read_column /*, internal_nested_type*/);
                break;
#    define DISPATCH(ARROW_NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
        case ARROW_NUMERIC_TYPE: \
            fillColumnWithNumericData<CPP_NUMERIC_TYPE>(arrow_column, read_column); \
            break;

                FOR_ARROW_NUMERIC_TYPES(DISPATCH)
#    undef DISPATCH
            // TODO: support TIMESTAMP_MICROS and TIMESTAMP_MILLIS with truncated micro- and milliseconds?
            // TODO: read JSON as a string?
            // TODO: read UUID as a string?
            default:
                throw Exception{"Unsupported parquet type \"" + arrow_column->type()->name() + "\" of an input column \""
                        + arrow_column->name() + "\"",
                    ErrorCodes::UNKNOWN_TYPE};
        }

        /// Wrap the data column with its null bytemap when the result must be nullable.
        if (column.type->isNullable())
        {
            MutableColumnPtr null_bytemap = DataTypeUInt8().createColumn();
            fillByteMapFromArrowColumn(arrow_column, null_bytemap);
            column.column = ColumnNullable::create(std::move(read_column), std::move(null_bytemap));
        }
        else
        {
            column.column = std::move(read_column);
        }

        /// Finally cast from the intermediate decoding type to the exact header type.
        column.column = castColumn(column, column_type, context);
        column.type = column_type;

        res.insert(std::move(column));
    }

    return res;
}
|
|
||||||
|
|
||||||
/// Registers the "Parquet" input format in the factory.
/// Block size / read-callback parameters are ignored: the stream decodes one row group per block.
void registerInputFormatParquet(FormatFactory & factory)
{
    auto creator = [](ReadBuffer & buf,
                      const Block & sample,
                      const Context & context,
                      UInt64 /* max_block_size */,
                      UInt64 /* rows_portion_size */,
                      FormatFactory::ReadCallback /* callback */,
                      const FormatSettings & /* settings */)
    {
        return std::make_shared<ParquetBlockInputStream>(buf, sample, context);
    };

    factory.registerInputFormat("Parquet", creator);
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
namespace DB
{
class FormatFactory;
/// Parquet support is disabled at build time (USE_PARQUET is 0): register nothing.
void registerInputFormatParquet(FormatFactory &)
{
}
}
|
|
||||||
#endif
|
|
@ -1,43 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "config_formats.h"
|
|
||||||
#if USE_PARQUET
|
|
||||||
# include <DataStreams/IBlockInputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace parquet { namespace arrow { class FileReader; } }
|
|
||||||
namespace arrow { class Buffer; }
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class Context;
|
|
||||||
|
|
||||||
/// Input stream that decodes parquet data from a ReadBuffer.
/// The whole input is buffered in memory first (see readImpl), then one row group
/// is decoded per readImpl() call.
class ParquetBlockInputStream : public IBlockInputStream
{
public:
    ParquetBlockInputStream(ReadBuffer & istr_, const Block & header_, const Context & context_);

    String getName() const override { return "Parquet"; }
    Block getHeader() const override;

protected:
    /// Decodes the next row group into a Block; returns an empty Block when exhausted.
    Block readImpl() override;

private:
    ReadBuffer & istr;  /// Source of raw parquet bytes.
    Block header;       /// Expected structure of the produced blocks.

    // TODO: check that this class implements every part of its parent

    const Context & context;  /// Needed for castColumn() during type conversion.

    std::unique_ptr<parquet::arrow::FileReader> file_reader;
    std::string file_data;                  /// Whole input buffered in memory.
    std::unique_ptr<arrow::Buffer> buffer;  /// Non-owning arrow view over file_data.
    int row_group_total = 0;    /// Number of row groups in the currently opened file.
    int row_group_current = 0;  /// Next row group to decode.
};
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,450 +0,0 @@
|
|||||||
#include "ParquetBlockOutputStream.h"
|
|
||||||
|
|
||||||
#if USE_PARQUET
|
|
||||||
// TODO: clean includes
|
|
||||||
# include <Columns/ColumnDecimal.h>
|
|
||||||
# include <Columns/ColumnFixedString.h>
|
|
||||||
# include <Columns/ColumnNullable.h>
|
|
||||||
# include <Columns/ColumnString.h>
|
|
||||||
# include <Columns/ColumnVector.h>
|
|
||||||
# include <Columns/ColumnsNumber.h>
|
|
||||||
# include <Core/ColumnWithTypeAndName.h>
|
|
||||||
# include <Core/callOnTypeIndex.h>
|
|
||||||
# include <DataTypes/DataTypeDateTime.h>
|
|
||||||
# include <DataTypes/DataTypeNullable.h>
|
|
||||||
# include <DataTypes/DataTypesDecimal.h>
|
|
||||||
# include <DataStreams/SquashingBlockOutputStream.h>
|
|
||||||
# include <Formats/FormatFactory.h>
|
|
||||||
# include <IO/WriteHelpers.h>
|
|
||||||
# include <arrow/api.h>
|
|
||||||
# include <arrow/io/api.h>
|
|
||||||
# include <arrow/util/decimal.h>
|
|
||||||
# include <parquet/arrow/writer.h>
|
|
||||||
# include <parquet/exception.h>
|
|
||||||
# include <parquet/util/memory.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int UNKNOWN_EXCEPTION;
|
|
||||||
extern const int UNKNOWN_TYPE;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// @param ostr             destination buffer for the serialized parquet file
/// @param header           structure that blocks passed to write() must match
/// @param format_settings  format options captured for later use
ParquetBlockOutputStream::ParquetBlockOutputStream(WriteBuffer & ostr, const Block & header, const FormatSettings & format_settings) : ostr{ostr}, header{header}, format_settings{format_settings}
{
}
|
|
||||||
/// Pushes any bytes buffered in the WriteBuffer to its destination.
void ParquetBlockOutputStream::flush()
{
    ostr.next();
}
|
|
||||||
void checkStatus(arrow::Status & status, const std::string & column_name)
|
|
||||||
{
|
|
||||||
if (!status.ok())
|
|
||||||
throw Exception{"Error with a parquet column \"" + column_name + "\": " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Bulk-copies a ClickHouse numeric column into an arrow array of the matching type.
/// @param null_bytemap  optional ClickHouse null map (1 = NULL); converted below to
///                      arrow's validity convention (1 = valid) before appending.
template <typename NumericType, typename ArrowBuilderType>
void fillArrowArrayWithNumericColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    const PaddedPODArray<NumericType> & internal_data = static_cast<const ColumnVector<NumericType> &>(*write_column).getData();
    ArrowBuilderType builder;
    arrow::Status status;

    const UInt8 * arrow_null_bytemap_raw_ptr = nullptr;
    PaddedPODArray<UInt8> arrow_null_bytemap;
    if (null_bytemap)
    {
        /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null
        arrow_null_bytemap.reserve(null_bytemap->size());
        for (size_t i = 0, size = null_bytemap->size(); i < size; ++i)
            arrow_null_bytemap.emplace_back(1 ^ (*null_bytemap)[i]);

        arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data();
    }

    status = builder.AppendValues(internal_data.data(), internal_data.size(), arrow_null_bytemap_raw_ptr);
    checkStatus(status, write_column->getName());

    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());
}
|
|
||||||
template <typename ColumnType>
|
|
||||||
void fillArrowArrayWithStringColumnData(
|
|
||||||
ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
|
|
||||||
{
|
|
||||||
const auto & internal_column = static_cast<const ColumnType &>(*write_column);
|
|
||||||
arrow::StringBuilder builder;
|
|
||||||
arrow::Status status;
|
|
||||||
|
|
||||||
for (size_t string_i = 0, size = internal_column.size(); string_i < size; ++string_i)
|
|
||||||
{
|
|
||||||
if (null_bytemap && (*null_bytemap)[string_i])
|
|
||||||
{
|
|
||||||
status = builder.AppendNull();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
StringRef string_ref = internal_column.getDataAt(string_i);
|
|
||||||
status = builder.Append(string_ref.data, string_ref.size);
|
|
||||||
}
|
|
||||||
|
|
||||||
checkStatus(status, write_column->getName());
|
|
||||||
}
|
|
||||||
|
|
||||||
status = builder.Finish(&arrow_array);
|
|
||||||
checkStatus(status, write_column->getName());
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Copies a ClickHouse Date column (UInt16 days since epoch) into an arrow UInt16 array.
/// See internal_type_to_arrow_type: "Date" is written as arrow::uint16(), not a real date type.
void fillArrowArrayWithDateColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    const PaddedPODArray<UInt16> & internal_data = static_cast<const ColumnVector<UInt16> &>(*write_column).getData();
    //arrow::Date32Builder date_builder;
    arrow::UInt16Builder builder;
    arrow::Status status;

    for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i)
    {
        if (null_bytemap && (*null_bytemap)[value_i])
            status = builder.AppendNull();
        else
            /// Raw day number appended as-is (UInt16Builder takes the value unchanged).
            status = builder.Append(internal_data[value_i]);
        checkStatus(status, write_column->getName());
    }

    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());
}
|
|
||||||
/// Copies a ClickHouse DateTime column (UInt32 seconds since epoch) into an arrow UInt32 array.
/// See internal_type_to_arrow_type: "DateTime" is written as arrow::uint32(); a Date64Builder
/// variant was abandoned (see commented-out code) because it saved as date32.
void fillArrowArrayWithDateTimeColumnData(
    ColumnPtr write_column, std::shared_ptr<arrow::Array> & arrow_array, const PaddedPODArray<UInt8> * null_bytemap)
{
    auto & internal_data = static_cast<const ColumnVector<UInt32> &>(*write_column).getData();
    //arrow::Date64Builder builder;
    arrow::UInt32Builder builder;
    arrow::Status status;

    for (size_t value_i = 0, size = internal_data.size(); value_i < size; ++value_i)
    {
        if (null_bytemap && (*null_bytemap)[value_i])
            status = builder.AppendNull();
        else
            /// Raw seconds value appended as-is.
            //status = date_builder.Append(static_cast<int64_t>(internal_data[value_i]) * 1000); // now ms. TODO check other units
            status = builder.Append(internal_data[value_i]);

        checkStatus(status, write_column->getName());
    }

    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());
}
|
|
||||||
/// Copies a ClickHouse decimal column into an arrow DecimalArray, value by value.
/// @tparam DataType      one of DataTypeDecimal<Decimal32/64/128> (selected via callOnIndexAndDataType)
/// @param decimal_type   provides precision and scale for the arrow decimal type
void fillArrowArrayWithDecimalColumnData(
    ColumnPtr write_column,
    std::shared_ptr<arrow::Array> & arrow_array,
    const PaddedPODArray<UInt8> * null_bytemap,
    const DataType * decimal_type)
{
    const auto & column = static_cast<const typename DataType::ColumnType &>(*write_column);
    arrow::DecimalBuilder builder(arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()));
    arrow::Status status;

    for (size_t value_i = 0, size = column.size(); value_i < size; ++value_i)
    {
        if (null_bytemap && (*null_bytemap)[value_i])
            status = builder.AppendNull();
        else
            /// arrow::Decimal128 is constructed from the raw little-endian bytes of the CH value.
            status = builder.Append(
                arrow::Decimal128(reinterpret_cast<const uint8_t *>(&column.getElement(value_i).value))); // TODO: try copy column

        checkStatus(status, write_column->getName());
    }
    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());

/* TODO column copy
    const auto & internal_data = static_cast<const typename DataType::ColumnType &>(*write_column).getData();
    //ArrowBuilderType numeric_builder;
    arrow::DecimalBuilder builder(arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()));
    arrow::Status status;

    const uint8_t * arrow_null_bytemap_raw_ptr = nullptr;
    PaddedPODArray<UInt8> arrow_null_bytemap;
    if (null_bytemap)
    {
        /// Invert values since Arrow interprets 1 as a non-null value, while CH as a null
        arrow_null_bytemap.reserve(null_bytemap->size());
        for (size_t i = 0, size = null_bytemap->size(); i < size; ++i)
            arrow_null_bytemap.emplace_back(1 ^ (*null_bytemap)[i]);

        arrow_null_bytemap_raw_ptr = arrow_null_bytemap.data();
    }

    status = builder.AppendValues(reinterpret_cast<const uint8_t*>(internal_data.data()), internal_data.size(), arrow_null_bytemap_raw_ptr);
    checkStatus(status, write_column->getName());

    status = builder.Finish(&arrow_array);
    checkStatus(status, write_column->getName());
*/
}
|
|
||||||
/// ClickHouse numeric types and the arrow builders used to serialize them;
/// expanded by the DISPATCH macro in ParquetBlockOutputStream::write.
# define FOR_INTERNAL_NUMERIC_TYPES(M) \
        M(UInt8, arrow::UInt8Builder) \
        M(Int8, arrow::Int8Builder) \
        M(UInt16, arrow::UInt16Builder) \
        M(Int16, arrow::Int16Builder) \
        M(UInt32, arrow::UInt32Builder) \
        M(Int32, arrow::Int32Builder) \
        M(UInt64, arrow::UInt64Builder) \
        M(Int64, arrow::Int64Builder) \
        M(Float32, arrow::FloatBuilder) \
        M(Float64, arrow::DoubleBuilder)

/// Mapping from ClickHouse type names to the arrow types they are written as.
/// Note that Date/DateTime are stored as raw unsigned integers, not arrow date types.
const std::unordered_map<String, std::shared_ptr<arrow::DataType>> internal_type_to_arrow_type = {
    {"UInt8", arrow::uint8()},
    {"Int8", arrow::int8()},
    {"UInt16", arrow::uint16()},
    {"Int16", arrow::int16()},
    {"UInt32", arrow::uint32()},
    {"Int32", arrow::int32()},
    {"UInt64", arrow::uint64()},
    {"Int64", arrow::int64()},
    {"Float32", arrow::float32()},
    {"Float64", arrow::float64()},

    //{"Date", arrow::date64()},
    //{"Date", arrow::date32()},
    {"Date", arrow::uint16()}, // CHECK
    //{"DateTime", arrow::date64()}, // BUG! saves as date32
    {"DateTime", arrow::uint32()},

    // TODO: ClickHouse can actually store non-utf8 strings!
    {"String", arrow::utf8()},
    {"FixedString", arrow::utf8()},
};
|
|
||||||
/// Returns a pointer to the null map stored inside a nullable column.
/// The pointee is owned by `column`, which must stay alive while the pointer is used.
const PaddedPODArray<UInt8> * extractNullBytemapPtr(ColumnPtr column)
{
    const ColumnPtr null_map_holder = static_cast<const ColumnNullable &>(*column).getNullMapColumnPtr();
    const auto & null_bytemap = static_cast<const ColumnVector<UInt8> &>(*null_map_holder).getData();
    return &null_bytemap;
}
|
|
||||||
|
|
||||||
class OstreamOutputStream : public parquet::OutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
explicit OstreamOutputStream(WriteBuffer & ostr_) : ostr(ostr_) {}
|
|
||||||
virtual ~OstreamOutputStream() {}
|
|
||||||
virtual void Close() {}
|
|
||||||
virtual int64_t Tell() { return total_length; }
|
|
||||||
virtual void Write(const uint8_t * data, int64_t length)
|
|
||||||
{
|
|
||||||
ostr.write(reinterpret_cast<const char *>(data), length);
|
|
||||||
total_length += length;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
int64_t total_length = 0;
|
|
||||||
|
|
||||||
PARQUET_DISALLOW_COPY_AND_ASSIGN(OstreamOutputStream);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
void ParquetBlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
block.checkNumberOfRows();
|
|
||||||
|
|
||||||
const size_t columns_num = block.columns();
|
|
||||||
|
|
||||||
/// For arrow::Schema and arrow::Table creation
|
|
||||||
std::vector<std::shared_ptr<arrow::Field>> arrow_fields;
|
|
||||||
std::vector<std::shared_ptr<arrow::Array>> arrow_arrays;
|
|
||||||
arrow_fields.reserve(columns_num);
|
|
||||||
arrow_arrays.reserve(columns_num);
|
|
||||||
|
|
||||||
for (size_t column_i = 0; column_i < columns_num; ++column_i)
|
|
||||||
{
|
|
||||||
// TODO: constructed every iteration
|
|
||||||
const ColumnWithTypeAndName & column = block.safeGetByPosition(column_i);
|
|
||||||
|
|
||||||
const bool is_column_nullable = column.type->isNullable();
|
|
||||||
const auto & column_nested_type
|
|
||||||
= is_column_nullable ? static_cast<const DataTypeNullable *>(column.type.get())->getNestedType() : column.type;
|
|
||||||
const std::string column_nested_type_name = column_nested_type->getFamilyName();
|
|
||||||
|
|
||||||
if (isDecimal(column_nested_type))
|
|
||||||
{
|
|
||||||
const auto add_decimal_field = [&](const auto & types) -> bool {
|
|
||||||
using Types = std::decay_t<decltype(types)>;
|
|
||||||
using ToDataType = typename Types::LeftType;
|
|
||||||
|
|
||||||
if constexpr (
|
|
||||||
std::is_same_v<
|
|
||||||
ToDataType,
|
|
||||||
DataTypeDecimal<
|
|
||||||
Decimal32>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
|
|
||||||
{
|
|
||||||
const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
|
|
||||||
arrow_fields.emplace_back(std::make_shared<arrow::Field>(
|
|
||||||
column.name, arrow::decimal(decimal_type->getPrecision(), decimal_type->getScale()), is_column_nullable));
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
};
|
|
||||||
callOnIndexAndDataType<void>(column_nested_type->getTypeId(), add_decimal_field);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (internal_type_to_arrow_type.find(column_nested_type_name) == internal_type_to_arrow_type.end())
|
|
||||||
{
|
|
||||||
throw Exception{"The type \"" + column_nested_type_name + "\" of a column \"" + column.name
|
|
||||||
+ "\""
|
|
||||||
" is not supported for conversion into a Parquet data format",
|
|
||||||
ErrorCodes::UNKNOWN_TYPE};
|
|
||||||
}
|
|
||||||
|
|
||||||
arrow_fields.emplace_back(std::make_shared<arrow::Field>(column.name, internal_type_to_arrow_type.at(column_nested_type_name), is_column_nullable));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::shared_ptr<arrow::Array> arrow_array;
|
|
||||||
|
|
||||||
ColumnPtr nested_column
|
|
||||||
= is_column_nullable ? static_cast<const ColumnNullable &>(*column.column).getNestedColumnPtr() : column.column;
|
|
||||||
const PaddedPODArray<UInt8> * null_bytemap = is_column_nullable ? extractNullBytemapPtr(column.column) : nullptr;
|
|
||||||
|
|
||||||
if ("String" == column_nested_type_name)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithStringColumnData<ColumnString>(nested_column, arrow_array, null_bytemap);
|
|
||||||
}
|
|
||||||
else if ("FixedString" == column_nested_type_name)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithStringColumnData<ColumnFixedString>(nested_column, arrow_array, null_bytemap);
|
|
||||||
}
|
|
||||||
else if ("Date" == column_nested_type_name)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDateColumnData(nested_column, arrow_array, null_bytemap);
|
|
||||||
}
|
|
||||||
else if ("DateTime" == column_nested_type_name)
|
|
||||||
{
|
|
||||||
fillArrowArrayWithDateTimeColumnData(nested_column, arrow_array, null_bytemap);
|
|
||||||
}
|
|
||||||
|
|
||||||
else if (isDecimal(column_nested_type))
|
|
||||||
{
|
|
||||||
auto fill_decimal = [&](const auto & types) -> bool
|
|
||||||
{
|
|
||||||
using Types = std::decay_t<decltype(types)>;
|
|
||||||
using ToDataType = typename Types::LeftType;
|
|
||||||
if constexpr (
|
|
||||||
std::is_same_v<
|
|
||||||
ToDataType,
|
|
||||||
DataTypeDecimal<
|
|
||||||
Decimal32>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>>)
|
|
||||||
{
|
|
||||||
const auto & decimal_type = static_cast<const ToDataType *>(column_nested_type.get());
|
|
||||||
fillArrowArrayWithDecimalColumnData(nested_column, arrow_array, null_bytemap, decimal_type);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
};
|
|
||||||
callOnIndexAndDataType<void>(column_nested_type->getTypeId(), fill_decimal);
|
|
||||||
}
|
|
||||||
# define DISPATCH(CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE) \
|
|
||||||
else if (#CPP_NUMERIC_TYPE == column_nested_type_name) \
|
|
||||||
{ \
|
|
||||||
fillArrowArrayWithNumericColumnData<CPP_NUMERIC_TYPE, ARROW_BUILDER_TYPE>(nested_column, arrow_array, null_bytemap); \
|
|
||||||
}
|
|
||||||
|
|
||||||
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
|
|
||||||
# undef DISPATCH
|
|
||||||
else
|
|
||||||
{
|
|
||||||
throw Exception{"Internal type \"" + column_nested_type_name + "\" of a column \"" + column.name
|
|
||||||
+ "\""
|
|
||||||
" is not supported for conversion into a Parquet data format",
|
|
||||||
ErrorCodes::UNKNOWN_TYPE};
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
arrow_arrays.emplace_back(std::move(arrow_array));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::shared_ptr<arrow::Schema> arrow_schema = std::make_shared<arrow::Schema>(std::move(arrow_fields));
|
|
||||||
|
|
||||||
std::shared_ptr<arrow::Table> arrow_table = arrow::Table::Make(arrow_schema, arrow_arrays);
|
|
||||||
|
|
||||||
auto sink = std::make_shared<OstreamOutputStream>(ostr);
|
|
||||||
|
|
||||||
if (!file_writer)
|
|
||||||
{
|
|
||||||
|
|
||||||
parquet::WriterProperties::Builder builder;
|
|
||||||
#if USE_SNAPPY
|
|
||||||
builder.compression(parquet::Compression::SNAPPY);
|
|
||||||
#endif
|
|
||||||
auto props = builder.build();
|
|
||||||
auto status = parquet::arrow::FileWriter::Open(
|
|
||||||
*arrow_table->schema(),
|
|
||||||
arrow::default_memory_pool(),
|
|
||||||
sink,
|
|
||||||
props, /*parquet::default_writer_properties(),*/
|
|
||||||
parquet::arrow::default_arrow_writer_properties(),
|
|
||||||
&file_writer);
|
|
||||||
if (!status.ok())
|
|
||||||
throw Exception{"Error while opening a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: calculate row_group_size depending on a number of rows and table size
|
|
||||||
auto status = file_writer->WriteTable(*arrow_table, format_settings.parquet.row_group_size);
|
|
||||||
|
|
||||||
if (!status.ok())
|
|
||||||
throw Exception{"Error while writing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
|
|
||||||
}
|
|
||||||
|
|
||||||
void ParquetBlockOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
if (file_writer)
|
|
||||||
{
|
|
||||||
auto status = file_writer->Close();
|
|
||||||
if (!status.ok())
|
|
||||||
throw Exception{"Error while closing a table: " + status.ToString(), ErrorCodes::UNKNOWN_EXCEPTION};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatParquet(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(
|
|
||||||
"Parquet", [](WriteBuffer & buf, const Block & sample, const Context & /*context*/, const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
BlockOutputStreamPtr impl = std::make_shared<ParquetBlockOutputStream>(buf, sample, format_settings);
|
|
||||||
auto res = std::make_shared<SquashingBlockOutputStream>(impl, impl->getHeader(), format_settings.parquet.row_group_size, 0);
|
|
||||||
res->disableFlush();
|
|
||||||
return res;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class FormatFactory;
|
|
||||||
void registerOutputFormatParquet(FormatFactory &)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,46 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "config_formats.h"
|
|
||||||
#if USE_PARQUET
|
|
||||||
# include <DataStreams/IBlockOutputStream.h>
|
|
||||||
# include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
namespace arrow
|
|
||||||
{
|
|
||||||
class Array;
|
|
||||||
class DataType;
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace parquet
|
|
||||||
{
|
|
||||||
namespace arrow
|
|
||||||
{
|
|
||||||
class FileWriter;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class ParquetBlockOutputStream : public IBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ParquetBlockOutputStream(WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
Block getHeader() const override { return header; }
|
|
||||||
void write(const Block & block) override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
String getContentType() const override { return "application/octet-stream"; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
Block header;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
|
|
||||||
std::unique_ptr<parquet::arrow::FileWriter> file_writer;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,277 +0,0 @@
|
|||||||
#include <sys/ioctl.h>
|
|
||||||
#include <port/unistd.h>
|
|
||||||
#include <Formats/PrettyBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <IO/WriteBufferFromString.h>
|
|
||||||
#include <Common/UTF8Helpers.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int ILLEGAL_COLUMN;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
PrettyBlockOutputStream::PrettyBlockOutputStream(
|
|
||||||
WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings)
|
|
||||||
: ostr(ostr_), header(header_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
struct winsize w;
|
|
||||||
if (0 == ioctl(STDOUT_FILENO, TIOCGWINSZ, &w))
|
|
||||||
terminal_width = w.ws_col;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettyBlockOutputStream::flush()
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Evaluate the visible width of the values and column names.
|
|
||||||
/// Note that number of code points is just a rough approximation of visible string width.
|
|
||||||
void PrettyBlockOutputStream::calculateWidths(
|
|
||||||
const Block & block, WidthsPerColumn & widths, Widths & max_widths, Widths & name_widths, const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
size_t rows = block.rows();
|
|
||||||
size_t columns = block.columns();
|
|
||||||
|
|
||||||
widths.resize(columns);
|
|
||||||
max_widths.resize_fill(columns);
|
|
||||||
name_widths.resize(columns);
|
|
||||||
|
|
||||||
/// Calculate widths of all values.
|
|
||||||
String serialized_value;
|
|
||||||
size_t prefix = 2; // Tab character adjustment
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & elem = block.getByPosition(i);
|
|
||||||
|
|
||||||
widths[i].resize(rows);
|
|
||||||
|
|
||||||
for (size_t j = 0; j < rows; ++j)
|
|
||||||
{
|
|
||||||
{
|
|
||||||
WriteBufferFromString out(serialized_value);
|
|
||||||
elem.type->serializeAsText(*elem.column, j, out, format_settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
widths[i][j] = std::min<UInt64>(format_settings.pretty.max_column_pad_width,
|
|
||||||
UTF8::computeWidth(reinterpret_cast<const UInt8 *>(serialized_value.data()), serialized_value.size(), prefix));
|
|
||||||
max_widths[i] = std::max(max_widths[i], widths[i][j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// And also calculate widths for names of columns.
|
|
||||||
{
|
|
||||||
// name string doesn't contain Tab, no need to pass `prefix`
|
|
||||||
name_widths[i] = std::min<UInt64>(format_settings.pretty.max_column_pad_width,
|
|
||||||
UTF8::computeWidth(reinterpret_cast<const UInt8 *>(elem.name.data()), elem.name.size()));
|
|
||||||
max_widths[i] = std::max(max_widths[i], name_widths[i]);
|
|
||||||
}
|
|
||||||
prefix += max_widths[i] + 3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettyBlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
UInt64 max_rows = format_settings.pretty.max_rows;
|
|
||||||
|
|
||||||
if (total_rows >= max_rows)
|
|
||||||
{
|
|
||||||
total_rows += block.rows();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t rows = block.rows();
|
|
||||||
size_t columns = block.columns();
|
|
||||||
|
|
||||||
WidthsPerColumn widths;
|
|
||||||
Widths max_widths;
|
|
||||||
Widths name_widths;
|
|
||||||
calculateWidths(block, widths, max_widths, name_widths, format_settings);
|
|
||||||
|
|
||||||
/// Create separators
|
|
||||||
std::stringstream top_separator;
|
|
||||||
std::stringstream middle_names_separator;
|
|
||||||
std::stringstream middle_values_separator;
|
|
||||||
std::stringstream bottom_separator;
|
|
||||||
|
|
||||||
top_separator << "┏";
|
|
||||||
middle_names_separator << "┡";
|
|
||||||
middle_values_separator << "├";
|
|
||||||
bottom_separator << "└";
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
{
|
|
||||||
top_separator << "┳";
|
|
||||||
middle_names_separator << "╇";
|
|
||||||
middle_values_separator << "┼";
|
|
||||||
bottom_separator << "┴";
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t j = 0; j < max_widths[i] + 2; ++j)
|
|
||||||
{
|
|
||||||
top_separator << "━";
|
|
||||||
middle_names_separator << "━";
|
|
||||||
middle_values_separator << "─";
|
|
||||||
bottom_separator << "─";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
top_separator << "┓\n";
|
|
||||||
middle_names_separator << "┩\n";
|
|
||||||
middle_values_separator << "┤\n";
|
|
||||||
bottom_separator << "┘\n";
|
|
||||||
|
|
||||||
std::string top_separator_s = top_separator.str();
|
|
||||||
std::string middle_names_separator_s = middle_names_separator.str();
|
|
||||||
std::string middle_values_separator_s = middle_values_separator.str();
|
|
||||||
std::string bottom_separator_s = bottom_separator.str();
|
|
||||||
|
|
||||||
/// Output the block
|
|
||||||
writeString(top_separator_s, ostr);
|
|
||||||
|
|
||||||
/// Names
|
|
||||||
writeCString("┃ ", ostr);
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeCString(" ┃ ", ostr);
|
|
||||||
|
|
||||||
const ColumnWithTypeAndName & col = block.getByPosition(i);
|
|
||||||
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[1m", ostr);
|
|
||||||
|
|
||||||
if (col.type->shouldAlignRightInPrettyFormats())
|
|
||||||
{
|
|
||||||
for (size_t k = 0; k < max_widths[i] - name_widths[i]; ++k)
|
|
||||||
writeChar(' ', ostr);
|
|
||||||
|
|
||||||
writeString(col.name, ostr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
writeString(col.name, ostr);
|
|
||||||
|
|
||||||
for (size_t k = 0; k < max_widths[i] - name_widths[i]; ++k)
|
|
||||||
writeChar(' ', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[0m", ostr);
|
|
||||||
}
|
|
||||||
writeCString(" ┃\n", ostr);
|
|
||||||
|
|
||||||
writeString(middle_names_separator_s, ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows && total_rows + i < max_rows; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeString(middle_values_separator_s, ostr);
|
|
||||||
|
|
||||||
writeCString("│ ", ostr);
|
|
||||||
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeCString(" │ ", ostr);
|
|
||||||
|
|
||||||
writeValueWithPadding(block.getByPosition(j), i, widths[j].empty() ? max_widths[j] : widths[j][i], max_widths[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeCString(" │\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeString(bottom_separator_s, ostr);
|
|
||||||
|
|
||||||
total_rows += rows;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettyBlockOutputStream::writeValueWithPadding(const ColumnWithTypeAndName & elem, size_t row_num, size_t value_width, size_t pad_to_width)
|
|
||||||
{
|
|
||||||
auto writePadding = [&]()
|
|
||||||
{
|
|
||||||
for (size_t k = 0; k < pad_to_width - value_width; ++k)
|
|
||||||
writeChar(' ', ostr);
|
|
||||||
};
|
|
||||||
|
|
||||||
if (elem.type->shouldAlignRightInPrettyFormats())
|
|
||||||
{
|
|
||||||
writePadding();
|
|
||||||
elem.type->serializeAsText(*elem.column.get(), row_num, ostr, format_settings);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
elem.type->serializeAsText(*elem.column.get(), row_num, ostr, format_settings);
|
|
||||||
writePadding();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettyBlockOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
if (total_rows >= format_settings.pretty.max_rows)
|
|
||||||
{
|
|
||||||
writeCString(" Showed first ", ostr);
|
|
||||||
writeIntText(format_settings.pretty.max_rows, ostr);
|
|
||||||
writeCString(".\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
total_rows = 0;
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettyBlockOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
writeCString("\nTotals:\n", ostr);
|
|
||||||
write(totals);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettyBlockOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
writeCString("\nExtremes:\n", ostr);
|
|
||||||
write(extremes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatPretty(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("Pretty", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<PrettyBlockOutputStream>(buf, sample, format_settings);
|
|
||||||
});
|
|
||||||
|
|
||||||
factory.registerOutputFormat("PrettyNoEscapes", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
FormatSettings changed_settings = format_settings;
|
|
||||||
changed_settings.pretty.color = false;
|
|
||||||
return std::make_shared<PrettyBlockOutputStream>(buf, sample, changed_settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,55 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <DataStreams/IBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class WriteBuffer;
|
|
||||||
class Context;
|
|
||||||
|
|
||||||
|
|
||||||
/** Prints the result in the form of beautiful tables.
|
|
||||||
*/
|
|
||||||
class PrettyBlockOutputStream : public IBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
/// no_escapes - do not use ANSI escape sequences - to display in the browser, not in the console.
|
|
||||||
PrettyBlockOutputStream(WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
Block getHeader() const override { return header; }
|
|
||||||
void write(const Block & block) override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
void setExtremes(const Block & extremes_) override { extremes = extremes_; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void writeTotals();
|
|
||||||
void writeExtremes();
|
|
||||||
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
const Block header;
|
|
||||||
size_t total_rows = 0;
|
|
||||||
size_t terminal_width = 0;
|
|
||||||
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
|
|
||||||
Block totals;
|
|
||||||
Block extremes;
|
|
||||||
|
|
||||||
using Widths = PODArray<size_t>;
|
|
||||||
using WidthsPerColumn = std::vector<Widths>;
|
|
||||||
|
|
||||||
static void calculateWidths(
|
|
||||||
const Block & block, WidthsPerColumn & widths, Widths & max_widths, Widths & name_widths, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
void writeValueWithPadding(const ColumnWithTypeAndName & elem, size_t row_num, size_t value_width, size_t pad_to_width);
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,161 +0,0 @@
|
|||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <DataStreams/SquashingBlockOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/PrettyCompactBlockOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
|
|
||||||
extern const int ILLEGAL_COLUMN;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void PrettyCompactBlockOutputStream::writeHeader(
|
|
||||||
const Block & block,
|
|
||||||
const Widths & max_widths,
|
|
||||||
const Widths & name_widths)
|
|
||||||
{
|
|
||||||
/// Names
|
|
||||||
writeCString("┌─", ostr);
|
|
||||||
for (size_t i = 0; i < max_widths.size(); ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeCString("─┬─", ostr);
|
|
||||||
|
|
||||||
const ColumnWithTypeAndName & col = block.getByPosition(i);
|
|
||||||
|
|
||||||
if (col.type->shouldAlignRightInPrettyFormats())
|
|
||||||
{
|
|
||||||
for (size_t k = 0; k < max_widths[i] - name_widths[i]; ++k)
|
|
||||||
writeCString("─", ostr);
|
|
||||||
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[1m", ostr);
|
|
||||||
writeString(col.name, ostr);
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[0m", ostr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[1m", ostr);
|
|
||||||
writeString(col.name, ostr);
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[0m", ostr);
|
|
||||||
|
|
||||||
for (size_t k = 0; k < max_widths[i] - name_widths[i]; ++k)
|
|
||||||
writeCString("─", ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writeCString("─┐\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PrettyCompactBlockOutputStream::writeBottom(const Widths & max_widths)
|
|
||||||
{
|
|
||||||
/// Create delimiters
|
|
||||||
std::stringstream bottom_separator;
|
|
||||||
|
|
||||||
bottom_separator << "└";
|
|
||||||
for (size_t i = 0; i < max_widths.size(); ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
bottom_separator << "┴";
|
|
||||||
|
|
||||||
for (size_t j = 0; j < max_widths[i] + 2; ++j)
|
|
||||||
bottom_separator << "─";
|
|
||||||
}
|
|
||||||
bottom_separator << "┘\n";
|
|
||||||
|
|
||||||
writeString(bottom_separator.str(), ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PrettyCompactBlockOutputStream::writeRow(
|
|
||||||
size_t row_num,
|
|
||||||
const Block & block,
|
|
||||||
const WidthsPerColumn & widths,
|
|
||||||
const Widths & max_widths)
|
|
||||||
{
|
|
||||||
size_t columns = max_widths.size();
|
|
||||||
|
|
||||||
writeCString("│ ", ostr);
|
|
||||||
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeCString(" │ ", ostr);
|
|
||||||
|
|
||||||
writeValueWithPadding(block.getByPosition(j), row_num, widths[j].empty() ? max_widths[j] : widths[j][row_num], max_widths[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeCString(" │\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PrettyCompactBlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
UInt64 max_rows = format_settings.pretty.max_rows;
|
|
||||||
|
|
||||||
if (total_rows >= max_rows)
|
|
||||||
{
|
|
||||||
total_rows += block.rows();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t rows = block.rows();
|
|
||||||
|
|
||||||
WidthsPerColumn widths;
|
|
||||||
Widths max_widths;
|
|
||||||
Widths name_widths;
|
|
||||||
calculateWidths(block, widths, max_widths, name_widths, format_settings);
|
|
||||||
|
|
||||||
writeHeader(block, max_widths, name_widths);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows && total_rows + i < max_rows; ++i)
|
|
||||||
writeRow(i, block, widths, max_widths);
|
|
||||||
|
|
||||||
writeBottom(max_widths);
|
|
||||||
|
|
||||||
total_rows += rows;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatPrettyCompact(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("PrettyCompact", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<PrettyCompactBlockOutputStream>(buf, sample, format_settings);
|
|
||||||
});
|
|
||||||
|
|
||||||
factory.registerOutputFormat("PrettyCompactNoEscapes", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
FormatSettings changed_settings = format_settings;
|
|
||||||
changed_settings.pretty.color = false;
|
|
||||||
return std::make_shared<PrettyCompactBlockOutputStream>(buf, sample, changed_settings);
|
|
||||||
});
|
|
||||||
|
|
||||||
factory.registerOutputFormat("PrettyCompactMonoBlock", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
BlockOutputStreamPtr impl = std::make_shared<PrettyCompactBlockOutputStream>(buf, sample, format_settings);
|
|
||||||
auto res = std::make_shared<SquashingBlockOutputStream>(impl, impl->getHeader(), format_settings.pretty.max_rows, 0);
|
|
||||||
res->disableFlush();
|
|
||||||
return res;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,25 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/PrettyBlockOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** Prints the result in the form of beautiful tables, but with fewer delimiter lines.
|
|
||||||
*/
|
|
||||||
class PrettyCompactBlockOutputStream : public PrettyBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
PrettyCompactBlockOutputStream(WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings_)
|
|
||||||
: PrettyBlockOutputStream(ostr_, header_, format_settings_) {}
|
|
||||||
|
|
||||||
void write(const Block & block) override;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void writeHeader(const Block & block, const Widths & max_widths, const Widths & name_widths);
|
|
||||||
void writeBottom(const Widths & max_widths);
|
|
||||||
void writeRow(size_t row_num, const Block & block, const WidthsPerColumn & widths, const Widths & max_widths);
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,116 +0,0 @@
|
|||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/PrettySpaceBlockOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
void PrettySpaceBlockOutputStream::write(const Block & block)
|
|
||||||
{
|
|
||||||
UInt64 max_rows = format_settings.pretty.max_rows;
|
|
||||||
|
|
||||||
if (total_rows >= max_rows)
|
|
||||||
{
|
|
||||||
total_rows += block.rows();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t rows = block.rows();
|
|
||||||
size_t columns = block.columns();
|
|
||||||
|
|
||||||
WidthsPerColumn widths;
|
|
||||||
Widths max_widths;
|
|
||||||
Widths name_widths;
|
|
||||||
calculateWidths(block, widths, max_widths, name_widths, format_settings);
|
|
||||||
|
|
||||||
/// Names
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeCString(" ", ostr);
|
|
||||||
|
|
||||||
const ColumnWithTypeAndName & col = block.getByPosition(i);
|
|
||||||
|
|
||||||
if (col.type->shouldAlignRightInPrettyFormats())
|
|
||||||
{
|
|
||||||
for (ssize_t k = 0; k < std::max(static_cast<ssize_t>(0), static_cast<ssize_t>(max_widths[i] - name_widths[i])); ++k)
|
|
||||||
writeChar(' ', ostr);
|
|
||||||
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[1m", ostr);
|
|
||||||
writeString(col.name, ostr);
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[0m", ostr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[1m", ostr);
|
|
||||||
writeString(col.name, ostr);
|
|
||||||
if (format_settings.pretty.color)
|
|
||||||
writeCString("\033[0m", ostr);
|
|
||||||
|
|
||||||
for (ssize_t k = 0; k < std::max(static_cast<ssize_t>(0), static_cast<ssize_t>(max_widths[i] - name_widths[i])); ++k)
|
|
||||||
writeChar(' ', ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writeCString("\n\n", ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows && total_rows + i < max_rows; ++i)
|
|
||||||
{
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeCString(" ", ostr);
|
|
||||||
|
|
||||||
writeValueWithPadding(block.getByPosition(j), i, widths[j].empty() ? max_widths[j] : widths[j][i], max_widths[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
total_rows += rows;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void PrettySpaceBlockOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
if (total_rows >= format_settings.pretty.max_rows)
|
|
||||||
{
|
|
||||||
writeCString("\nShowed first ", ostr);
|
|
||||||
writeIntText(format_settings.pretty.max_rows, ostr);
|
|
||||||
writeCString(".\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
total_rows = 0;
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatPrettySpace(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("PrettySpace", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<PrettySpaceBlockOutputStream>(buf, sample, format_settings);
|
|
||||||
});
|
|
||||||
|
|
||||||
factory.registerOutputFormat("PrettySpaceNoEscapes", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
FormatSettings changed_settings = format_settings;
|
|
||||||
changed_settings.pretty.color = false;
|
|
||||||
return std::make_shared<PrettySpaceBlockOutputStream>(buf, sample, changed_settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,21 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/PrettyBlockOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** Prints the result, aligned with spaces.
|
|
||||||
*/
|
|
||||||
class PrettySpaceBlockOutputStream : public PrettyBlockOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
PrettySpaceBlockOutputStream(WriteBuffer & ostr_, const Block & header_, const FormatSettings & format_settings_)
|
|
||||||
: PrettyBlockOutputStream(ostr_, header_, format_settings_) {}
|
|
||||||
|
|
||||||
void write(const Block & block) override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,94 +0,0 @@
|
|||||||
#include "ProtobufRowInputStream.h"
|
|
||||||
|
|
||||||
#if USE_PROTOBUF
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/FormatSchemaInfo.h>
|
|
||||||
#include <Formats/ProtobufSchemas.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
ProtobufRowInputStream::ProtobufRowInputStream(ReadBuffer & in_, const Block & header, const FormatSchemaInfo & format_schema)
|
|
||||||
: data_types(header.getDataTypes()), reader(in_, ProtobufSchemas::instance().getMessageTypeForFormatSchema(format_schema), header.getNames())
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
ProtobufRowInputStream::~ProtobufRowInputStream() = default;
|
|
||||||
|
|
||||||
bool ProtobufRowInputStream::read(MutableColumns & columns, RowReadExtension & extra)
|
|
||||||
{
|
|
||||||
if (!reader.startMessage())
|
|
||||||
return false; // EOF reached, no more messages.
|
|
||||||
|
|
||||||
// Set of columns for which the values were read. The rest will be filled with default values.
|
|
||||||
auto & read_columns = extra.read_columns;
|
|
||||||
read_columns.assign(columns.size(), false);
|
|
||||||
|
|
||||||
// Read values from this message and put them to the columns while it's possible.
|
|
||||||
size_t column_index;
|
|
||||||
while (reader.readColumnIndex(column_index))
|
|
||||||
{
|
|
||||||
bool allow_add_row = !static_cast<bool>(read_columns[column_index]);
|
|
||||||
do
|
|
||||||
{
|
|
||||||
bool row_added;
|
|
||||||
data_types[column_index]->deserializeProtobuf(*columns[column_index], reader, allow_add_row, row_added);
|
|
||||||
if (row_added)
|
|
||||||
{
|
|
||||||
read_columns[column_index] = true;
|
|
||||||
allow_add_row = false;
|
|
||||||
}
|
|
||||||
} while (reader.canReadMoreValues());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fill non-visited columns with the default values.
|
|
||||||
for (column_index = 0; column_index < read_columns.size(); ++column_index)
|
|
||||||
if (!read_columns[column_index])
|
|
||||||
data_types[column_index]->insertDefaultInto(*columns[column_index]);
|
|
||||||
|
|
||||||
reader.endMessage();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ProtobufRowInputStream::allowSyncAfterError() const
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ProtobufRowInputStream::syncAfterError()
|
|
||||||
{
|
|
||||||
reader.endMessage(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatProtobuf(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerInputFormat("Protobuf", [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context & context,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<ProtobufRowInputStream>(buf, sample, FormatSchemaInfo(context, "Protobuf")),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class FormatFactory;
|
|
||||||
void registerInputFormatProtobuf(FormatFactory &) {}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,40 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "config_formats.h"
|
|
||||||
#if USE_PROTOBUF
|
|
||||||
|
|
||||||
#include <DataTypes/IDataType.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
#include <Formats/ProtobufReader.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class Block;
|
|
||||||
class FormatSchemaInfo;
|
|
||||||
|
|
||||||
|
|
||||||
/** Stream designed to deserialize data from the google protobuf format.
|
|
||||||
* Each row is read as a separated message.
|
|
||||||
* These messages are delimited according to documentation
|
|
||||||
* https://github.com/protocolbuffers/protobuf/blob/master/src/google/protobuf/util/delimited_message_util.h
|
|
||||||
* Serializing in the protobuf format requires the 'format_schema' setting to be set, e.g.
|
|
||||||
* INSERT INTO table FORMAT Protobuf SETTINGS format_schema = 'schema:Message'
|
|
||||||
* where schema is the name of "schema.proto" file specifying protobuf schema.
|
|
||||||
*/
|
|
||||||
class ProtobufRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ProtobufRowInputStream(ReadBuffer & in_, const Block & header, const FormatSchemaInfo & format_schema);
|
|
||||||
~ProtobufRowInputStream() override;
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension & extra) override;
|
|
||||||
bool allowSyncAfterError() const override;
|
|
||||||
void syncAfterError() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
DataTypes data_types;
|
|
||||||
ProtobufReader reader;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -1,55 +0,0 @@
|
|||||||
#include <Formats/FormatFactory.h>
|
|
||||||
|
|
||||||
#include "config_formats.h"
|
|
||||||
#if USE_PROTOBUF
|
|
||||||
|
|
||||||
#include "ProtobufRowOutputStream.h"
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
#include <Formats/FormatSchemaInfo.h>
|
|
||||||
#include <Formats/ProtobufSchemas.h>
|
|
||||||
#include <google/protobuf/descriptor.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
ProtobufRowOutputStream::ProtobufRowOutputStream(WriteBuffer & out, const Block & header, const FormatSchemaInfo & format_schema)
|
|
||||||
: data_types(header.getDataTypes()), writer(out, ProtobufSchemas::instance().getMessageTypeForFormatSchema(format_schema), header.getNames())
|
|
||||||
{
|
|
||||||
value_indices.resize(header.columns());
|
|
||||||
}
|
|
||||||
|
|
||||||
void ProtobufRowOutputStream::write(const Block & block, size_t row_num)
|
|
||||||
{
|
|
||||||
writer.startMessage();
|
|
||||||
std::fill(value_indices.begin(), value_indices.end(), 0);
|
|
||||||
size_t column_index;
|
|
||||||
while (writer.writeField(column_index))
|
|
||||||
data_types[column_index]->serializeProtobuf(
|
|
||||||
*block.getByPosition(column_index).column, row_num, writer, value_indices[column_index]);
|
|
||||||
writer.endMessage();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatProtobuf(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(
|
|
||||||
"Protobuf", [](WriteBuffer & buf, const Block & header, const Context & context, const FormatSettings &)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<ProtobufRowOutputStream>(buf, header, FormatSchemaInfo(context, "Protobuf")), header);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class FormatFactory;
|
|
||||||
void registerOutputFormatProtobuf(FormatFactory &) {}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
@ -1,44 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <DataTypes/IDataType.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
#include <Formats/ProtobufWriter.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace google
|
|
||||||
{
|
|
||||||
namespace protobuf
|
|
||||||
{
|
|
||||||
class Message;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
class Block;
|
|
||||||
class FormatSchemaInfo;
|
|
||||||
|
|
||||||
/** Stream designed to serialize data in the google protobuf format.
|
|
||||||
* Each row is written as a separated message.
|
|
||||||
* These messages are delimited according to documentation
|
|
||||||
* https://github.com/protocolbuffers/protobuf/blob/master/src/google/protobuf/util/delimited_message_util.h
|
|
||||||
* Serializing in the protobuf format requires the 'format_schema' setting to be set, e.g.
|
|
||||||
* SELECT * from table FORMAT Protobuf SETTINGS format_schema = 'schema:Message'
|
|
||||||
* where schema is the name of "schema.proto" file specifying protobuf schema.
|
|
||||||
*/
|
|
||||||
class ProtobufRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ProtobufRowOutputStream(WriteBuffer & out, const Block & header, const FormatSchemaInfo & format_schema);
|
|
||||||
|
|
||||||
void write(const Block & block, size_t row_num) override;
|
|
||||||
std::string getContentType() const override { return "application/octet-stream"; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
DataTypes data_types;
|
|
||||||
ProtobufWriter writer;
|
|
||||||
std::vector<size_t> value_indices;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,211 +0,0 @@
|
|||||||
#include <IO/ReadHelpers.h>
|
|
||||||
#include <Formats/TSKVRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int INCORRECT_DATA;
|
|
||||||
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
|
|
||||||
extern const int CANNOT_READ_ALL_DATA;
|
|
||||||
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
TSKVRowInputStream::TSKVRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings)
|
|
||||||
: istr(istr_), header(header_), format_settings(format_settings), name_map(header.columns())
|
|
||||||
{
|
|
||||||
/// In this format, we assume that column name cannot contain BOM,
|
|
||||||
/// so BOM at beginning of stream cannot be confused with name of field, and it is safe to skip it.
|
|
||||||
skipBOMIfExists(istr);
|
|
||||||
|
|
||||||
size_t num_columns = header.columns();
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
name_map[header.safeGetByPosition(i).name] = i; /// NOTE You could place names more cache-locally.
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Read the field name in the `tskv` format.
|
|
||||||
* Return true if the field is followed by an equal sign,
|
|
||||||
* otherwise (field with no value) return false.
|
|
||||||
* The reference to the field name will be written to `ref`.
|
|
||||||
* A temporary `tmp` buffer can also be used to copy the field name to it.
|
|
||||||
* When reading, skips the name and the equal sign after it.
|
|
||||||
*/
|
|
||||||
static bool readName(ReadBuffer & buf, StringRef & ref, String & tmp)
|
|
||||||
{
|
|
||||||
tmp.clear();
|
|
||||||
|
|
||||||
while (!buf.eof())
|
|
||||||
{
|
|
||||||
const char * next_pos = find_first_symbols<'\t', '\n', '\\', '='>(buf.position(), buf.buffer().end());
|
|
||||||
|
|
||||||
if (next_pos == buf.buffer().end())
|
|
||||||
{
|
|
||||||
tmp.append(buf.position(), next_pos - buf.position());
|
|
||||||
buf.next();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Came to the end of the name.
|
|
||||||
if (*next_pos != '\\')
|
|
||||||
{
|
|
||||||
bool have_value = *next_pos == '=';
|
|
||||||
if (tmp.empty())
|
|
||||||
{
|
|
||||||
/// No need to copy data, you can refer directly to the `buf`.
|
|
||||||
ref = StringRef(buf.position(), next_pos - buf.position());
|
|
||||||
buf.position() += next_pos + have_value - buf.position();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/// Copy the data to a temporary string and return a reference to it.
|
|
||||||
tmp.append(buf.position(), next_pos - buf.position());
|
|
||||||
buf.position() += next_pos + have_value - buf.position();
|
|
||||||
ref = StringRef(tmp);
|
|
||||||
}
|
|
||||||
return have_value;
|
|
||||||
}
|
|
||||||
/// The name has an escape sequence.
|
|
||||||
else
|
|
||||||
{
|
|
||||||
tmp.append(buf.position(), next_pos - buf.position());
|
|
||||||
buf.position() += next_pos + 1 - buf.position();
|
|
||||||
if (buf.eof())
|
|
||||||
throw Exception("Cannot parse escape sequence", ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE);
|
|
||||||
|
|
||||||
tmp.push_back(parseEscapeSequence(*buf.position()));
|
|
||||||
++buf.position();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw Exception("Unexpected end of stream while reading key name from TSKV format", ErrorCodes::CANNOT_READ_ALL_DATA);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool TSKVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
|
|
||||||
{
|
|
||||||
if (istr.eof())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
size_t num_columns = columns.size();
|
|
||||||
|
|
||||||
/// Set of columns for which the values were read. The rest will be filled with default values.
|
|
||||||
read_columns.assign(num_columns, false);
|
|
||||||
|
|
||||||
if (unlikely(*istr.position() == '\n'))
|
|
||||||
{
|
|
||||||
/// An empty string. It is permissible, but it is unclear why.
|
|
||||||
++istr.position();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
StringRef name_ref;
|
|
||||||
bool has_value = readName(istr, name_ref, name_buf);
|
|
||||||
ssize_t index = -1;
|
|
||||||
|
|
||||||
if (has_value)
|
|
||||||
{
|
|
||||||
/// NOTE Optimization is possible by caching the order of fields (which is almost always the same)
|
|
||||||
/// and quickly checking for the next expected field, instead of searching the hash table.
|
|
||||||
|
|
||||||
auto it = name_map.find(name_ref);
|
|
||||||
if (name_map.end() == it)
|
|
||||||
{
|
|
||||||
if (!format_settings.skip_unknown_fields)
|
|
||||||
throw Exception("Unknown field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
|
|
||||||
|
|
||||||
/// If the key is not found, skip the value.
|
|
||||||
NullSink sink;
|
|
||||||
readEscapedStringInto(sink, istr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
index = it->getSecond();
|
|
||||||
|
|
||||||
if (read_columns[index])
|
|
||||||
throw Exception("Duplicate field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
|
|
||||||
|
|
||||||
read_columns[index] = true;
|
|
||||||
|
|
||||||
header.getByPosition(index).type->deserializeAsTextEscaped(*columns[index], istr, format_settings);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/// The only thing that can go without value is `tskv` fragment that is ignored.
|
|
||||||
if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4)))
|
|
||||||
throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (istr.eof())
|
|
||||||
{
|
|
||||||
throw Exception("Unexpected end of stream after field in TSKV format: " + name_ref.toString(), ErrorCodes::CANNOT_READ_ALL_DATA);
|
|
||||||
}
|
|
||||||
else if (*istr.position() == '\t')
|
|
||||||
{
|
|
||||||
++istr.position();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (*istr.position() == '\n')
|
|
||||||
{
|
|
||||||
++istr.position();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/// Possibly a garbage was written into column, remove it
|
|
||||||
if (index >= 0)
|
|
||||||
{
|
|
||||||
columns[index]->popBack(1);
|
|
||||||
read_columns[index] = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
throw Exception("Found garbage after field in TSKV format: " + name_ref.toString(), ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Fill in the not met columns with default values.
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
if (!read_columns[i])
|
|
||||||
header.getByPosition(i).type->insertDefaultInto(*columns[i]);
|
|
||||||
|
|
||||||
/// return info about defaults set
|
|
||||||
ext.read_columns = read_columns;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TSKVRowInputStream::syncAfterError()
|
|
||||||
{
|
|
||||||
skipToUnescapedNextLineOrEOF(istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatTSKV(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerInputFormat("TSKV", [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<TSKVRowInputStream>(buf, sample, settings),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,48 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Common/HashTable/HashMap.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** Stream for reading data in TSKV format.
|
|
||||||
* TSKV is a very inefficient data format.
|
|
||||||
* Similar to TSV, but each field is written as key=value.
|
|
||||||
* Fields can be listed in any order (including, in different lines there may be different order),
|
|
||||||
* and some fields may be missing.
|
|
||||||
* An equal sign can be escaped in the field name.
|
|
||||||
* Also, as an additional element there may be a useless tskv fragment - it needs to be ignored.
|
|
||||||
*/
|
|
||||||
class TSKVRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
TSKVRowInputStream(ReadBuffer & istr_, const Block & header_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension &) override;
|
|
||||||
bool allowSyncAfterError() const override { return true; }
|
|
||||||
void syncAfterError() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
ReadBuffer & istr;
|
|
||||||
Block header;
|
|
||||||
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
|
|
||||||
/// Buffer for the read from the stream the field name. Used when you have to copy it.
|
|
||||||
String name_buf;
|
|
||||||
|
|
||||||
/// Hash table matching `field name -> position in the block`. NOTE You can use perfect hash map.
|
|
||||||
using NameMap = HashMap<StringRef, size_t, StringRefHash>;
|
|
||||||
NameMap name_map;
|
|
||||||
|
|
||||||
std::vector<UInt8> read_columns;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,56 +0,0 @@
|
|||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <IO/WriteBufferFromString.h>
|
|
||||||
#include <Formats/TSKVRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
TSKVRowOutputStream::TSKVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & format_settings_)
|
|
||||||
: TabSeparatedRowOutputStream(ostr_, sample_, false, false, format_settings_)
|
|
||||||
{
|
|
||||||
NamesAndTypesList columns(sample_.getNamesAndTypesList());
|
|
||||||
fields.assign(columns.begin(), columns.end());
|
|
||||||
|
|
||||||
for (auto & field : fields)
|
|
||||||
{
|
|
||||||
WriteBufferFromOwnString wb;
|
|
||||||
writeAnyEscapedString<'='>(field.name.data(), field.name.data() + field.name.size(), wb);
|
|
||||||
writeCString("=", wb);
|
|
||||||
field.name = wb.str();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TSKVRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
writeString(fields[field_number].name, ostr);
|
|
||||||
type.serializeAsTextEscaped(column, row_num, ostr, format_settings);
|
|
||||||
++field_number;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TSKVRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
field_number = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatTSKV(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("TSKV", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<TSKVRowOutputStream>(buf, sample, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,27 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/TabSeparatedRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** The stream for outputting data in the TSKV format.
|
|
||||||
* TSKV is similar to TabSeparated, but before every value, its name and equal sign are specified: name=value.
|
|
||||||
* This format is very inefficient.
|
|
||||||
*/
|
|
||||||
class TSKVRowOutputStream : public TabSeparatedRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
TSKVRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & format_settings);
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
NamesAndTypes fields;
|
|
||||||
size_t field_number = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,25 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/TabSeparatedRowOutputStream.h>
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
struct FormatSettings;
|
|
||||||
|
|
||||||
/** A stream for outputting data in tsv format, but without escaping individual values.
|
|
||||||
* (That is, the output is irreversible.)
|
|
||||||
*/
|
|
||||||
class TabSeparatedRawRowOutputStream : public TabSeparatedRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
TabSeparatedRawRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
|
||||||
: TabSeparatedRowOutputStream(ostr_, sample_, with_names_, with_types_, format_settings_) {}
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override
|
|
||||||
{
|
|
||||||
type.serializeAsText(column, row_num, ostr, format_settings);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,179 +0,0 @@
|
|||||||
#include <Formats/TabSeparatedRowOutputStream.h>
|
|
||||||
#include <Formats/TabSeparatedRawRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
TabSeparatedRowOutputStream::TabSeparatedRowOutputStream(
|
|
||||||
WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_, const FormatSettings & format_settings)
|
|
||||||
: ostr(ostr_), sample(sample_), with_names(with_names_), with_types(with_types_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::flush()
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
size_t columns = sample.columns();
|
|
||||||
|
|
||||||
if (with_names)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
writeEscapedString(sample.safeGetByPosition(i).name, ostr);
|
|
||||||
writeChar(i == columns - 1 ? '\n' : '\t', ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (with_types)
|
|
||||||
{
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
writeEscapedString(sample.safeGetByPosition(i).type->getName(), ostr);
|
|
||||||
writeChar(i == columns - 1 ? '\n' : '\t', ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
type.serializeAsTextEscaped(column, row_num, ostr, format_settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writeFieldDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('\t', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
size_t columns = totals.columns();
|
|
||||||
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
writeRowStartDelimiter();
|
|
||||||
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeFieldDelimiter();
|
|
||||||
writeField(*totals.getByPosition(j).column.get(), *totals.getByPosition(j).type.get(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeRowEndDelimiter();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void TabSeparatedRowOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
size_t rows = extremes.rows();
|
|
||||||
size_t columns = extremes.columns();
|
|
||||||
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < rows; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeRowBetweenDelimiter();
|
|
||||||
|
|
||||||
writeRowStartDelimiter();
|
|
||||||
|
|
||||||
for (size_t j = 0; j < columns; ++j)
|
|
||||||
{
|
|
||||||
if (j != 0)
|
|
||||||
writeFieldDelimiter();
|
|
||||||
writeField(*extremes.getByPosition(j).column.get(), *extremes.getByPosition(j).type.get(), i);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeRowEndDelimiter();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatTabSeparated(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
for (auto name : {"TabSeparated", "TSV"})
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(name, [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<TabSeparatedRowOutputStream>(buf, sample, false, false, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto name : {"TabSeparatedRaw", "TSVRaw"})
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(name, [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<TabSeparatedRawRowOutputStream>(buf, sample, false, false, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto name : {"TabSeparatedWithNames", "TSVWithNames"})
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(name, [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<TabSeparatedRowOutputStream>(buf, sample, true, false, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"})
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat(name, [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<TabSeparatedRowOutputStream>(buf, sample, true, true, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,51 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class WriteBuffer;
|
|
||||||
|
|
||||||
/** A stream for outputting data in tsv format.
|
|
||||||
*/
|
|
||||||
class TabSeparatedRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
/** with_names - output in the first line a header with column names
|
|
||||||
* with_types - output the next line header with the names of the types
|
|
||||||
*/
|
|
||||||
TabSeparatedRowOutputStream(WriteBuffer & ostr_, const Block & sample_, bool with_names_, bool with_types_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeFieldDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
void writePrefix() override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
void setExtremes(const Block & extremes_) override { extremes = extremes_; }
|
|
||||||
|
|
||||||
/// https://www.iana.org/assignments/media-types/text/tab-separated-values
|
|
||||||
String getContentType() const override { return "text/tab-separated-values; charset=UTF-8"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void writeTotals();
|
|
||||||
void writeExtremes();
|
|
||||||
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
const Block sample;
|
|
||||||
bool with_names;
|
|
||||||
bool with_types;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
Block totals;
|
|
||||||
Block extremes;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,168 +0,0 @@
|
|||||||
#include <IO/ReadHelpers.h>
|
|
||||||
#include <Interpreters/evaluateConstantExpression.h>
|
|
||||||
#include <Interpreters/Context.h>
|
|
||||||
#include <Interpreters/convertFieldToType.h>
|
|
||||||
#include <Parsers/TokenIterator.h>
|
|
||||||
#include <Parsers/ExpressionListParsers.h>
|
|
||||||
#include <Formats/ValuesRowInputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
|
||||||
#include <Common/FieldVisitors.h>
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Common/typeid_cast.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
namespace ErrorCodes
|
|
||||||
{
|
|
||||||
extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
|
|
||||||
extern const int CANNOT_PARSE_QUOTED_STRING;
|
|
||||||
extern const int CANNOT_PARSE_NUMBER;
|
|
||||||
extern const int CANNOT_PARSE_DATE;
|
|
||||||
extern const int CANNOT_PARSE_DATETIME;
|
|
||||||
extern const int CANNOT_READ_ARRAY_FROM_TEXT;
|
|
||||||
extern const int CANNOT_PARSE_DATE;
|
|
||||||
extern const int SYNTAX_ERROR;
|
|
||||||
extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
ValuesRowInputStream::ValuesRowInputStream(ReadBuffer & istr_, const Block & header_, const Context & context_, const FormatSettings & format_settings)
|
|
||||||
: istr(istr_), header(header_), context(std::make_unique<Context>(context_)), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
|
||||||
skipBOMIfExists(istr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool ValuesRowInputStream::read(MutableColumns & columns, RowReadExtension &)
|
|
||||||
{
|
|
||||||
size_t num_columns = columns.size();
|
|
||||||
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
|
|
||||||
if (istr.eof() || *istr.position() == ';')
|
|
||||||
return false;
|
|
||||||
|
|
||||||
/** Typically, this is the usual format for streaming parsing.
|
|
||||||
* But as an exception, it also supports processing arbitrary expressions instead of values.
|
|
||||||
* This is very inefficient. But if there are no expressions, then there is no overhead.
|
|
||||||
*/
|
|
||||||
ParserExpression parser;
|
|
||||||
|
|
||||||
assertChar('(', istr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
|
||||||
{
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
|
|
||||||
char * prev_istr_position = istr.position();
|
|
||||||
size_t prev_istr_bytes = istr.count() - istr.offset();
|
|
||||||
|
|
||||||
bool rollback_on_exception = false;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
header.getByPosition(i).type->deserializeAsTextQuoted(*columns[i], istr, format_settings);
|
|
||||||
rollback_on_exception = true;
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
|
|
||||||
if (i != num_columns - 1)
|
|
||||||
assertChar(',', istr);
|
|
||||||
else
|
|
||||||
assertChar(')', istr);
|
|
||||||
}
|
|
||||||
catch (const Exception & e)
|
|
||||||
{
|
|
||||||
if (!format_settings.values.interpret_expressions)
|
|
||||||
throw;
|
|
||||||
|
|
||||||
/** The normal streaming parser could not parse the value.
|
|
||||||
* Let's try to parse it with a SQL parser as a constant expression.
|
|
||||||
* This is an exceptional case.
|
|
||||||
*/
|
|
||||||
if (e.code() == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED
|
|
||||||
|| e.code() == ErrorCodes::CANNOT_PARSE_QUOTED_STRING
|
|
||||||
|| e.code() == ErrorCodes::CANNOT_PARSE_NUMBER
|
|
||||||
|| e.code() == ErrorCodes::CANNOT_PARSE_DATE
|
|
||||||
|| e.code() == ErrorCodes::CANNOT_PARSE_DATETIME
|
|
||||||
|| e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT)
|
|
||||||
{
|
|
||||||
/// TODO Case when the expression does not fit entirely in the buffer.
|
|
||||||
|
|
||||||
/// If the beginning of the value is no longer in the buffer.
|
|
||||||
if (istr.count() - istr.offset() != prev_istr_bytes)
|
|
||||||
throw;
|
|
||||||
|
|
||||||
if (rollback_on_exception)
|
|
||||||
columns[i]->popBack(1);
|
|
||||||
|
|
||||||
const IDataType & type = *header.getByPosition(i).type;
|
|
||||||
|
|
||||||
Expected expected;
|
|
||||||
|
|
||||||
Tokens tokens(prev_istr_position, istr.buffer().end());
|
|
||||||
TokenIterator token_iterator(tokens);
|
|
||||||
|
|
||||||
ASTPtr ast;
|
|
||||||
if (!parser.parse(token_iterator, ast, expected))
|
|
||||||
throw Exception("Cannot parse expression of type " + type.getName() + " here: "
|
|
||||||
+ String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)),
|
|
||||||
ErrorCodes::SYNTAX_ERROR);
|
|
||||||
|
|
||||||
istr.position() = const_cast<char *>(token_iterator->begin);
|
|
||||||
|
|
||||||
std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(ast, *context);
|
|
||||||
Field value = convertFieldToType(value_raw.first, type, value_raw.second.get());
|
|
||||||
|
|
||||||
/// Check that we are indeed allowed to insert a NULL.
|
|
||||||
if (value.isNull())
|
|
||||||
{
|
|
||||||
if (!type.isNullable())
|
|
||||||
throw Exception{"Expression returns value " + applyVisitor(FieldVisitorToString(), value)
|
|
||||||
+ ", that is out of range of type " + type.getName()
|
|
||||||
+ ", at: " + String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)),
|
|
||||||
ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};
|
|
||||||
}
|
|
||||||
|
|
||||||
columns[i]->insert(value);
|
|
||||||
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
|
|
||||||
if (i != num_columns - 1)
|
|
||||||
assertChar(',', istr);
|
|
||||||
else
|
|
||||||
assertChar(')', istr);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
skipWhitespaceIfAny(istr);
|
|
||||||
if (!istr.eof() && *istr.position() == ',')
|
|
||||||
++istr.position();
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatValues(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerInputFormat("Values", [](
|
|
||||||
ReadBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context & context,
|
|
||||||
UInt64 max_block_size,
|
|
||||||
UInt64 rows_portion_size,
|
|
||||||
FormatFactory::ReadCallback callback,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockInputStreamFromRowInputStream>(
|
|
||||||
std::make_shared<ValuesRowInputStream>(buf, sample, context, settings),
|
|
||||||
sample, max_block_size, rows_portion_size, callback, settings);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,35 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/IRowInputStream.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class Context;
|
|
||||||
class ReadBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** Stream to read data in VALUES format (as in INSERT query).
|
|
||||||
*/
|
|
||||||
class ValuesRowInputStream : public IRowInputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
/** Data is parsed using fast, streaming parser.
|
|
||||||
* If interpret_expressions is true, it will, in addition, try to use SQL parser and interpreter
|
|
||||||
* in case when streaming parser could not parse field (this is very slow).
|
|
||||||
*/
|
|
||||||
ValuesRowInputStream(ReadBuffer & istr_, const Block & header_, const Context & context_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
bool read(MutableColumns & columns, RowReadExtension &) override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
ReadBuffer & istr;
|
|
||||||
Block header;
|
|
||||||
std::unique_ptr<Context> context; /// pimpl
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,63 +0,0 @@
|
|||||||
#include <Formats/ValuesRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <Columns/IColumn.h>
|
|
||||||
#include <DataTypes/IDataType.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
ValuesRowOutputStream::ValuesRowOutputStream(WriteBuffer & ostr_, const FormatSettings & format_settings)
|
|
||||||
: ostr(ostr_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void ValuesRowOutputStream::flush()
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void ValuesRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
type.serializeAsTextQuoted(column, row_num, ostr, format_settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ValuesRowOutputStream::writeFieldDelimiter()
|
|
||||||
{
|
|
||||||
writeChar(',', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ValuesRowOutputStream::writeRowStartDelimiter()
|
|
||||||
{
|
|
||||||
writeChar('(', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ValuesRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeChar(')', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ValuesRowOutputStream::writeRowBetweenDelimiter()
|
|
||||||
{
|
|
||||||
writeCString(",", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatValues(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("Values", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<ValuesRowOutputStream>(buf, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,33 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class WriteBuffer;
|
|
||||||
|
|
||||||
|
|
||||||
/** A stream for outputting data in the VALUES format (as in the INSERT request).
|
|
||||||
*/
|
|
||||||
class ValuesRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
ValuesRowOutputStream(WriteBuffer & ostr_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeFieldDelimiter() override;
|
|
||||||
void writeRowStartDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
void writeRowBetweenDelimiter() override;
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,184 +0,0 @@
|
|||||||
#include <cmath>
|
|
||||||
|
|
||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <IO/WriteBufferFromString.h>
|
|
||||||
#include <Formats/VerticalRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
#include <Common/UTF8Helpers.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
VerticalRowOutputStream::VerticalRowOutputStream(
|
|
||||||
WriteBuffer & ostr_, const Block & sample_, const FormatSettings & format_settings)
|
|
||||||
: ostr(ostr_), sample(sample_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
size_t columns = sample.columns();
|
|
||||||
|
|
||||||
using Widths = std::vector<size_t>;
|
|
||||||
Widths name_widths(columns);
|
|
||||||
size_t max_name_width = 0;
|
|
||||||
|
|
||||||
String serialized_value;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
/// Note that number of code points is just a rough approximation of visible string width.
|
|
||||||
const String & name = sample.getByPosition(i).name;
|
|
||||||
|
|
||||||
name_widths[i] = UTF8::computeWidth(reinterpret_cast<const UInt8 *>(name.data()), name.size());
|
|
||||||
|
|
||||||
if (name_widths[i] > max_name_width)
|
|
||||||
max_name_width = name_widths[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
names_and_paddings.resize(columns);
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
WriteBufferFromString out(names_and_paddings[i]);
|
|
||||||
writeString(sample.getByPosition(i).name, out);
|
|
||||||
writeCString(": ", out);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
size_t new_size = max_name_width - name_widths[i] + names_and_paddings[i].size();
|
|
||||||
names_and_paddings[i].resize(new_size, ' ');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::flush()
|
|
||||||
{
|
|
||||||
ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
if (row_number > format_settings.pretty.max_rows)
|
|
||||||
return;
|
|
||||||
|
|
||||||
writeString(names_and_paddings[field_number], ostr);
|
|
||||||
writeValue(column, type, row_num);
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
|
|
||||||
++field_number;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeValue(const IColumn & column, const IDataType & type, size_t row_num) const
|
|
||||||
{
|
|
||||||
type.serializeAsText(column, row_num, ostr, format_settings);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeRowStartDelimiter()
|
|
||||||
{
|
|
||||||
++row_number;
|
|
||||||
|
|
||||||
if (row_number > format_settings.pretty.max_rows)
|
|
||||||
return;
|
|
||||||
|
|
||||||
writeCString("Row ", ostr);
|
|
||||||
writeIntText(row_number, ostr);
|
|
||||||
writeCString(":\n", ostr);
|
|
||||||
|
|
||||||
size_t width = log10(row_number + 1) + 1 + strlen("Row :");
|
|
||||||
for (size_t i = 0; i < width; ++i)
|
|
||||||
writeCString("─", ostr);
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeRowBetweenDelimiter()
|
|
||||||
{
|
|
||||||
if (row_number > format_settings.pretty.max_rows)
|
|
||||||
return;
|
|
||||||
|
|
||||||
writeCString("\n", ostr);
|
|
||||||
field_number = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
if (row_number > format_settings.pretty.max_rows)
|
|
||||||
{
|
|
||||||
writeCString("Showed first ", ostr);
|
|
||||||
writeIntText(format_settings.pretty.max_rows, ostr);
|
|
||||||
writeCString(".\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (totals || extremes)
|
|
||||||
{
|
|
||||||
writeCString("\n", ostr);
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeSpecialRow(const Block & block, size_t row_num, const char * title)
|
|
||||||
{
|
|
||||||
writeCString("\n", ostr);
|
|
||||||
|
|
||||||
row_number = 0;
|
|
||||||
field_number = 0;
|
|
||||||
|
|
||||||
size_t columns = block.columns();
|
|
||||||
|
|
||||||
writeCString(title, ostr);
|
|
||||||
writeCString(":\n", ostr);
|
|
||||||
|
|
||||||
size_t width = strlen(title) + 1;
|
|
||||||
for (size_t i = 0; i < width; ++i)
|
|
||||||
writeCString("─", ostr);
|
|
||||||
writeChar('\n', ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < columns; ++i)
|
|
||||||
{
|
|
||||||
if (i != 0)
|
|
||||||
writeFieldDelimiter();
|
|
||||||
|
|
||||||
auto & col = block.getByPosition(i);
|
|
||||||
writeField(*col.column, *col.type, row_num);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
writeSpecialRow(totals, 0, "Totals");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void VerticalRowOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
writeSpecialRow(extremes, 0, "Min");
|
|
||||||
writeSpecialRow(extremes, 1, "Max");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatVertical(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("Vertical", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<VerticalRowOutputStream>(buf, sample, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,55 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
class WriteBuffer;
|
|
||||||
class Context;
|
|
||||||
|
|
||||||
|
|
||||||
/** Stream to output data in format "each value in separate row".
|
|
||||||
* Usable to show few rows with many columns.
|
|
||||||
*/
|
|
||||||
class VerticalRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
VerticalRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeRowStartDelimiter() override;
|
|
||||||
void writeRowBetweenDelimiter() override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override;
|
|
||||||
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
void setExtremes(const Block & extremes_) override { extremes = extremes_; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual void writeValue(const IColumn & column, const IDataType & type, size_t row_num) const;
|
|
||||||
|
|
||||||
void writeTotals();
|
|
||||||
void writeExtremes();
|
|
||||||
/// For totals and extremes.
|
|
||||||
void writeSpecialRow(const Block & block, size_t row_num, const char * title);
|
|
||||||
|
|
||||||
WriteBuffer & ostr;
|
|
||||||
const Block sample;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
size_t field_number = 0;
|
|
||||||
size_t row_number = 0;
|
|
||||||
|
|
||||||
using NamesAndPaddings = std::vector<String>;
|
|
||||||
NamesAndPaddings names_and_paddings;
|
|
||||||
|
|
||||||
Block totals;
|
|
||||||
Block extremes;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,240 +0,0 @@
|
|||||||
#include <IO/WriteHelpers.h>
|
|
||||||
#include <IO/WriteBufferValidUTF8.h>
|
|
||||||
#include <Formats/XMLRowOutputStream.h>
|
|
||||||
#include <Formats/FormatFactory.h>
|
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
XMLRowOutputStream::XMLRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & format_settings)
|
|
||||||
: dst_ostr(ostr_), format_settings(format_settings)
|
|
||||||
{
|
|
||||||
NamesAndTypesList columns(sample_.getNamesAndTypesList());
|
|
||||||
fields.assign(columns.begin(), columns.end());
|
|
||||||
field_tag_names.resize(sample_.columns());
|
|
||||||
|
|
||||||
bool need_validate_utf8 = false;
|
|
||||||
for (size_t i = 0; i < sample_.columns(); ++i)
|
|
||||||
{
|
|
||||||
if (!sample_.getByPosition(i).type->textCanContainOnlyValidUTF8())
|
|
||||||
need_validate_utf8 = true;
|
|
||||||
|
|
||||||
/// As element names, we will use the column name if it has a valid form, or "field", otherwise.
|
|
||||||
/// The condition below is more strict than the XML standard requires.
|
|
||||||
bool is_column_name_suitable = true;
|
|
||||||
const char * begin = fields[i].name.data();
|
|
||||||
const char * end = begin + fields[i].name.size();
|
|
||||||
for (const char * pos = begin; pos != end; ++pos)
|
|
||||||
{
|
|
||||||
char c = *pos;
|
|
||||||
if (!(isAlphaASCII(c)
|
|
||||||
|| (pos != begin && isNumericASCII(c))
|
|
||||||
|| c == '_'
|
|
||||||
|| c == '-'
|
|
||||||
|| c == '.'))
|
|
||||||
{
|
|
||||||
is_column_name_suitable = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
field_tag_names[i] = is_column_name_suitable
|
|
||||||
? fields[i].name
|
|
||||||
: "field";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (need_validate_utf8)
|
|
||||||
{
|
|
||||||
validating_ostr = std::make_unique<WriteBufferValidUTF8>(dst_ostr);
|
|
||||||
ostr = validating_ostr.get();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
ostr = &dst_ostr;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writePrefix()
|
|
||||||
{
|
|
||||||
writeCString("<?xml version='1.0' encoding='UTF-8' ?>\n", *ostr);
|
|
||||||
writeCString("<result>\n", *ostr);
|
|
||||||
writeCString("\t<meta>\n", *ostr);
|
|
||||||
writeCString("\t\t<columns>\n", *ostr);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < fields.size(); ++i)
|
|
||||||
{
|
|
||||||
writeCString("\t\t\t<column>\n", *ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t\t\t<name>", *ostr);
|
|
||||||
writeXMLString(fields[i].name, *ostr);
|
|
||||||
writeCString("</name>\n", *ostr);
|
|
||||||
writeCString("\t\t\t\t<type>", *ostr);
|
|
||||||
writeXMLString(fields[i].type->getName(), *ostr);
|
|
||||||
writeCString("</type>\n", *ostr);
|
|
||||||
|
|
||||||
writeCString("\t\t\t</column>\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeCString("\t\t</columns>\n", *ostr);
|
|
||||||
writeCString("\t</meta>\n", *ostr);
|
|
||||||
writeCString("\t<data>\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeField(const IColumn & column, const IDataType & type, size_t row_num)
|
|
||||||
{
|
|
||||||
writeCString("\t\t\t<", *ostr);
|
|
||||||
writeString(field_tag_names[field_number], *ostr);
|
|
||||||
writeCString(">", *ostr);
|
|
||||||
type.serializeAsTextXML(column, row_num, *ostr, format_settings);
|
|
||||||
writeCString("</", *ostr);
|
|
||||||
writeString(field_tag_names[field_number], *ostr);
|
|
||||||
writeCString(">\n", *ostr);
|
|
||||||
++field_number;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeRowStartDelimiter()
|
|
||||||
{
|
|
||||||
writeCString("\t\t<row>\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeRowEndDelimiter()
|
|
||||||
{
|
|
||||||
writeCString("\t\t</row>\n", *ostr);
|
|
||||||
field_number = 0;
|
|
||||||
++row_count;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeSuffix()
|
|
||||||
{
|
|
||||||
writeCString("\t</data>\n", *ostr);
|
|
||||||
|
|
||||||
writeTotals();
|
|
||||||
writeExtremes();
|
|
||||||
|
|
||||||
writeCString("\t<rows>", *ostr);
|
|
||||||
writeIntText(row_count, *ostr);
|
|
||||||
writeCString("</rows>\n", *ostr);
|
|
||||||
|
|
||||||
writeRowsBeforeLimitAtLeast();
|
|
||||||
|
|
||||||
if (format_settings.write_statistics)
|
|
||||||
writeStatistics();
|
|
||||||
|
|
||||||
writeCString("</result>\n", *ostr);
|
|
||||||
ostr->next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeRowsBeforeLimitAtLeast()
|
|
||||||
{
|
|
||||||
if (applied_limit)
|
|
||||||
{
|
|
||||||
writeCString("\t<rows_before_limit_at_least>", *ostr);
|
|
||||||
writeIntText(rows_before_limit, *ostr);
|
|
||||||
writeCString("</rows_before_limit_at_least>\n", *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeTotals()
|
|
||||||
{
|
|
||||||
if (totals)
|
|
||||||
{
|
|
||||||
writeCString("\t<totals>\n", *ostr);
|
|
||||||
|
|
||||||
size_t totals_columns = totals.columns();
|
|
||||||
for (size_t i = 0; i < totals_columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & column = totals.safeGetByPosition(i);
|
|
||||||
|
|
||||||
writeCString("\t\t<", *ostr);
|
|
||||||
writeString(field_tag_names[i], *ostr);
|
|
||||||
writeCString(">", *ostr);
|
|
||||||
column.type->serializeAsTextXML(*column.column.get(), 0, *ostr, format_settings);
|
|
||||||
writeCString("</", *ostr);
|
|
||||||
writeString(field_tag_names[i], *ostr);
|
|
||||||
writeCString(">\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeCString("\t</totals>\n", *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void writeExtremesElement(
|
|
||||||
const char * title, const Block & extremes, size_t row_num, const Names & field_tag_names, WriteBuffer & ostr, const FormatSettings & format_settings)
|
|
||||||
{
|
|
||||||
writeCString("\t\t<", ostr);
|
|
||||||
writeCString(title, ostr);
|
|
||||||
writeCString(">\n", ostr);
|
|
||||||
|
|
||||||
size_t extremes_columns = extremes.columns();
|
|
||||||
for (size_t i = 0; i < extremes_columns; ++i)
|
|
||||||
{
|
|
||||||
const ColumnWithTypeAndName & column = extremes.safeGetByPosition(i);
|
|
||||||
|
|
||||||
writeCString("\t\t\t<", ostr);
|
|
||||||
writeString(field_tag_names[i], ostr);
|
|
||||||
writeCString(">", ostr);
|
|
||||||
column.type->serializeAsTextXML(*column.column.get(), row_num, ostr, format_settings);
|
|
||||||
writeCString("</", ostr);
|
|
||||||
writeString(field_tag_names[i], ostr);
|
|
||||||
writeCString(">\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
writeCString("\t\t</", ostr);
|
|
||||||
writeCString(title, ostr);
|
|
||||||
writeCString(">\n", ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeExtremes()
|
|
||||||
{
|
|
||||||
if (extremes)
|
|
||||||
{
|
|
||||||
writeCString("\t<extremes>\n", *ostr);
|
|
||||||
writeExtremesElement("min", extremes, 0, field_tag_names, *ostr, format_settings);
|
|
||||||
writeExtremesElement("max", extremes, 1, field_tag_names, *ostr, format_settings);
|
|
||||||
writeCString("\t</extremes>\n", *ostr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::onProgress(const Progress & value)
|
|
||||||
{
|
|
||||||
progress.incrementPiecewiseAtomically(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void XMLRowOutputStream::writeStatistics()
|
|
||||||
{
|
|
||||||
writeCString("\t<statistics>\n", *ostr);
|
|
||||||
writeCString("\t\t<elapsed>", *ostr);
|
|
||||||
writeText(watch.elapsedSeconds(), *ostr);
|
|
||||||
writeCString("</elapsed>\n", *ostr);
|
|
||||||
writeCString("\t\t<rows_read>", *ostr);
|
|
||||||
writeText(progress.read_rows.load(), *ostr);
|
|
||||||
writeCString("</rows_read>\n", *ostr);
|
|
||||||
writeCString("\t\t<bytes_read>", *ostr);
|
|
||||||
writeText(progress.read_bytes.load(), *ostr);
|
|
||||||
writeCString("</bytes_read>\n", *ostr);
|
|
||||||
writeCString("\t</statistics>\n", *ostr);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void registerOutputFormatXML(FormatFactory & factory)
|
|
||||||
{
|
|
||||||
factory.registerOutputFormat("XML", [](
|
|
||||||
WriteBuffer & buf,
|
|
||||||
const Block & sample,
|
|
||||||
const Context &,
|
|
||||||
const FormatSettings & settings)
|
|
||||||
{
|
|
||||||
return std::make_shared<BlockOutputStreamFromRowOutputStream>(
|
|
||||||
std::make_shared<XMLRowOutputStream>(buf, sample, settings), sample);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,74 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <Core/Block.h>
|
|
||||||
#include <IO/Progress.h>
|
|
||||||
#include <IO/WriteBuffer.h>
|
|
||||||
#include <Common/Stopwatch.h>
|
|
||||||
#include <Formats/FormatSettings.h>
|
|
||||||
#include <Formats/IRowOutputStream.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
|
||||||
{
|
|
||||||
|
|
||||||
/** A stream for outputting data in XML format.
|
|
||||||
*/
|
|
||||||
class XMLRowOutputStream : public IRowOutputStream
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
XMLRowOutputStream(WriteBuffer & ostr_, const Block & sample_, const FormatSettings & format_settings);
|
|
||||||
|
|
||||||
void writeField(const IColumn & column, const IDataType & type, size_t row_num) override;
|
|
||||||
void writeRowStartDelimiter() override;
|
|
||||||
void writeRowEndDelimiter() override;
|
|
||||||
void writePrefix() override;
|
|
||||||
void writeSuffix() override;
|
|
||||||
|
|
||||||
void flush() override
|
|
||||||
{
|
|
||||||
ostr->next();
|
|
||||||
|
|
||||||
if (validating_ostr)
|
|
||||||
dst_ostr.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
void setRowsBeforeLimit(size_t rows_before_limit_) override
|
|
||||||
{
|
|
||||||
applied_limit = true;
|
|
||||||
rows_before_limit = rows_before_limit_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setTotals(const Block & totals_) override { totals = totals_; }
|
|
||||||
void setExtremes(const Block & extremes_) override { extremes = extremes_; }
|
|
||||||
|
|
||||||
void onProgress(const Progress & value) override;
|
|
||||||
|
|
||||||
String getContentType() const override { return "application/xml; charset=UTF-8"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
|
|
||||||
void writeRowsBeforeLimitAtLeast();
|
|
||||||
virtual void writeTotals();
|
|
||||||
virtual void writeExtremes();
|
|
||||||
void writeStatistics();
|
|
||||||
|
|
||||||
WriteBuffer & dst_ostr;
|
|
||||||
std::unique_ptr<WriteBuffer> validating_ostr; /// Validates UTF-8 sequences, replaces bad sequences with replacement character.
|
|
||||||
WriteBuffer * ostr;
|
|
||||||
|
|
||||||
size_t field_number = 0;
|
|
||||||
size_t row_count = 0;
|
|
||||||
bool applied_limit = false;
|
|
||||||
size_t rows_before_limit = 0;
|
|
||||||
NamesAndTypes fields;
|
|
||||||
Names field_tag_names;
|
|
||||||
Block totals;
|
|
||||||
Block extremes;
|
|
||||||
|
|
||||||
Progress progress;
|
|
||||||
Stopwatch watch;
|
|
||||||
const FormatSettings format_settings;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -13,11 +13,12 @@
|
|||||||
#include <DataTypes/DataTypeString.h>
|
#include <DataTypes/DataTypeString.h>
|
||||||
|
|
||||||
#include <Formats/TabSeparatedRowInputStream.h>
|
#include <Formats/TabSeparatedRowInputStream.h>
|
||||||
#include <Formats/TabSeparatedRowOutputStream.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
||||||
|
|
||||||
#include <DataStreams/copyData.h>
|
#include <DataStreams/copyData.h>
|
||||||
|
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
|
||||||
|
#include <Processors/Formats/OutputStreamToOutputFormat.h>
|
||||||
|
|
||||||
|
|
||||||
int main(int, char **)
|
int main(int, char **)
|
||||||
@ -46,10 +47,9 @@ try
|
|||||||
|
|
||||||
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
|
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
|
||||||
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
|
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
|
||||||
RowOutputStreamPtr row_output = std::make_shared<TabSeparatedRowOutputStream>(out_buf, sample, false, false, format_settings);
|
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, format_settings));
|
||||||
BlockOutputStreamFromRowOutputStream block_output(row_output, sample);
|
|
||||||
|
|
||||||
copyData(block_input, block_output);
|
copyData(block_input, *block_output);
|
||||||
}
|
}
|
||||||
catch (const DB::Exception & e)
|
catch (const DB::Exception & e)
|
||||||
{
|
{
|
||||||
|
@ -10,11 +10,12 @@
|
|||||||
#include <DataTypes/DataTypeString.h>
|
#include <DataTypes/DataTypeString.h>
|
||||||
|
|
||||||
#include <Formats/TabSeparatedRowInputStream.h>
|
#include <Formats/TabSeparatedRowInputStream.h>
|
||||||
#include <Formats/TabSeparatedRowOutputStream.h>
|
|
||||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
||||||
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
#include <Formats/BlockOutputStreamFromRowOutputStream.h>
|
||||||
|
|
||||||
#include <DataStreams/copyData.h>
|
#include <DataStreams/copyData.h>
|
||||||
|
#include <Processors/Formats/OutputStreamToOutputFormat.h>
|
||||||
|
#include <Processors/Formats/Impl/TabSeparatedRowOutputFormat.h>
|
||||||
|
|
||||||
|
|
||||||
using namespace DB;
|
using namespace DB;
|
||||||
@ -40,12 +41,12 @@ try
|
|||||||
FormatSettings format_settings;
|
FormatSettings format_settings;
|
||||||
|
|
||||||
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
|
RowInputStreamPtr row_input = std::make_shared<TabSeparatedRowInputStream>(in_buf, sample, false, false, format_settings);
|
||||||
RowOutputStreamPtr row_output = std::make_shared<TabSeparatedRowOutputStream>(out_buf, sample, false, false, format_settings);
|
|
||||||
|
|
||||||
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
|
BlockInputStreamFromRowInputStream block_input(row_input, sample, DEFAULT_INSERT_BLOCK_SIZE, 0, []{}, format_settings);
|
||||||
BlockOutputStreamFromRowOutputStream block_output(row_output, sample);
|
|
||||||
|
|
||||||
copyData(block_input, block_output);
|
BlockOutputStreamPtr block_output = std::make_shared<OutputStreamToOutputFormat>(
|
||||||
|
std::make_shared<TabSeparatedRowOutputFormat>(out_buf, sample, false, false, format_settings));
|
||||||
|
|
||||||
|
copyData(block_input, *block_output);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
|
@ -1632,6 +1632,7 @@ private:
|
|||||||
using ToDataType = DataTypeDecimal<FieldType>;
|
using ToDataType = DataTypeDecimal<FieldType>;
|
||||||
|
|
||||||
TypeIndex type_index = from_type->getTypeId();
|
TypeIndex type_index = from_type->getTypeId();
|
||||||
|
UInt32 precision = to_type->getPrecision();
|
||||||
UInt32 scale = to_type->getScale();
|
UInt32 scale = to_type->getScale();
|
||||||
|
|
||||||
WhichDataType which(type_index);
|
WhichDataType which(type_index);
|
||||||
@ -1645,9 +1646,9 @@ private:
|
|||||||
throw Exception{"Conversion from " + from_type->getName() + " to " + to_type->getName() + " is not supported",
|
throw Exception{"Conversion from " + from_type->getName() + " to " + to_type->getName() + " is not supported",
|
||||||
ErrorCodes::CANNOT_CONVERT_TYPE};
|
ErrorCodes::CANNOT_CONVERT_TYPE};
|
||||||
|
|
||||||
return [type_index, scale] (Block & block, const ColumnNumbers & arguments, const size_t result, size_t input_rows_count)
|
return [type_index, precision, scale] (Block & block, const ColumnNumbers & arguments, const size_t result, size_t input_rows_count)
|
||||||
{
|
{
|
||||||
callOnIndexAndDataType<ToDataType>(type_index, [&](const auto & types) -> bool
|
auto res = callOnIndexAndDataType<ToDataType>(type_index, [&](const auto & types) -> bool
|
||||||
{
|
{
|
||||||
using Types = std::decay_t<decltype(types)>;
|
using Types = std::decay_t<decltype(types)>;
|
||||||
using LeftDataType = typename Types::LeftType;
|
using LeftDataType = typename Types::LeftType;
|
||||||
@ -1656,6 +1657,14 @@ private:
|
|||||||
ConvertImpl<LeftDataType, RightDataType, NameCast>::execute(block, arguments, result, input_rows_count, scale);
|
ConvertImpl<LeftDataType, RightDataType, NameCast>::execute(block, arguments, result, input_rows_count, scale);
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/// Additionally check if callOnIndexAndDataType wasn't called at all.
|
||||||
|
if (!res)
|
||||||
|
{
|
||||||
|
auto to = DataTypeDecimal<FieldType>(precision, scale);
|
||||||
|
throw Exception{"Conversion from " + std::string(getTypeName(type_index)) + " to " + to.getName() +
|
||||||
|
" is not supported", ErrorCodes::CANNOT_CONVERT_TYPE};
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2022,6 +2031,11 @@ private:
|
|||||||
|
|
||||||
const auto & tmp_res = tmp_block.getByPosition(tmp_res_index);
|
const auto & tmp_res = tmp_block.getByPosition(tmp_res_index);
|
||||||
|
|
||||||
|
/// May happen in fuzzy tests. For debug purpose.
|
||||||
|
if (!tmp_res.column)
|
||||||
|
throw Exception("Couldn't convert " + block.getByPosition(arguments[0]).type->getName() + " to "
|
||||||
|
+ nested_type->getName() + " in " + " prepareRemoveNullable wrapper.", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
|
||||||
res.column = wrapInNullable(tmp_res.column, Block({block.getByPosition(arguments[0]), tmp_res}), {0}, 1, input_rows_count);
|
res.column = wrapInNullable(tmp_res.column, Block({block.getByPosition(arguments[0]), tmp_res}), {0}, 1, input_rows_count);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -36,6 +36,10 @@ const UInt8 geohash_base32_decode_lookup_table[256] = {
|
|||||||
const size_t BITS_PER_SYMBOL = 5;
|
const size_t BITS_PER_SYMBOL = 5;
|
||||||
const size_t MAX_PRECISION = 12;
|
const size_t MAX_PRECISION = 12;
|
||||||
const size_t MAX_BITS = MAX_PRECISION * BITS_PER_SYMBOL * 1.5;
|
const size_t MAX_BITS = MAX_PRECISION * BITS_PER_SYMBOL * 1.5;
|
||||||
|
const Float64 LON_MIN = -180;
|
||||||
|
const Float64 LON_MAX = 180;
|
||||||
|
const Float64 LAT_MIN = -90;
|
||||||
|
const Float64 LAT_MAX = 90;
|
||||||
|
|
||||||
using Encoded = std::array<UInt8, MAX_BITS>;
|
using Encoded = std::array<UInt8, MAX_BITS>;
|
||||||
|
|
||||||
@ -62,9 +66,9 @@ inline Encoded encodeCoordinate(Float64 coord, Float64 min, Float64 max, UInt8 b
|
|||||||
Encoded result;
|
Encoded result;
|
||||||
result.fill(0);
|
result.fill(0);
|
||||||
|
|
||||||
for (int i = 0; i < bits; ++i)
|
for (size_t i = 0; i < bits; ++i)
|
||||||
{
|
{
|
||||||
Float64 mid = (max + min) / 2;
|
const Float64 mid = (max + min) / 2;
|
||||||
if (coord >= mid)
|
if (coord >= mid)
|
||||||
{
|
{
|
||||||
result[i] = 1;
|
result[i] = 1;
|
||||||
@ -83,7 +87,7 @@ inline Encoded encodeCoordinate(Float64 coord, Float64 min, Float64 max, UInt8 b
|
|||||||
inline Float64 decodeCoordinate(const Encoded & coord, Float64 min, Float64 max, UInt8 bits)
|
inline Float64 decodeCoordinate(const Encoded & coord, Float64 min, Float64 max, UInt8 bits)
|
||||||
{
|
{
|
||||||
Float64 mid = (max + min) / 2;
|
Float64 mid = (max + min) / 2;
|
||||||
for (int i = 0; i < bits; ++i)
|
for (size_t i = 0; i < bits; ++i)
|
||||||
{
|
{
|
||||||
const auto c = coord[i];
|
const auto c = coord[i];
|
||||||
if (c == 1)
|
if (c == 1)
|
||||||
@ -148,7 +152,7 @@ inline void base32Encode(const Encoded & binary, UInt8 precision, char * out)
|
|||||||
{
|
{
|
||||||
extern const char geohash_base32_encode_lookup_table[32];
|
extern const char geohash_base32_encode_lookup_table[32];
|
||||||
|
|
||||||
for (UInt8 i = 0; i < precision * BITS_PER_SYMBOL; i += 5)
|
for (UInt8 i = 0; i < precision * BITS_PER_SYMBOL; i += BITS_PER_SYMBOL)
|
||||||
{
|
{
|
||||||
UInt8 v = binary[i];
|
UInt8 v = binary[i];
|
||||||
v <<= 1;
|
v <<= 1;
|
||||||
@ -187,24 +191,38 @@ inline Encoded base32Decode(const char * encoded_string, size_t encoded_length)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline Float64 getMaxSpan(CoordType type)
|
||||||
|
{
|
||||||
|
if (type == LONGITUDE)
|
||||||
|
{
|
||||||
|
return LON_MAX - LON_MIN;
|
||||||
|
}
|
||||||
|
|
||||||
|
return LAT_MAX - LAT_MIN;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace DB
|
inline Float64 getSpan(UInt8 precision, CoordType type)
|
||||||
{
|
{
|
||||||
|
const auto bits = singleCoordBitsPrecision(precision, type);
|
||||||
|
// since every bit of precision divides span by 2, divide max span by 2^bits.
|
||||||
|
return ldexp(getMaxSpan(type), -1 * bits);
|
||||||
|
}
|
||||||
|
|
||||||
namespace GeoUtils
|
inline UInt8 geohashPrecision(UInt8 precision)
|
||||||
{
|
|
||||||
|
|
||||||
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char *& out)
|
|
||||||
{
|
{
|
||||||
if (precision == 0 || precision > MAX_PRECISION)
|
if (precision == 0 || precision > MAX_PRECISION)
|
||||||
{
|
{
|
||||||
precision = MAX_PRECISION;
|
precision = MAX_PRECISION;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return precision;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline size_t geohashEncodeImpl(Float64 longitude, Float64 latitude, UInt8 precision, char * out)
|
||||||
|
{
|
||||||
const Encoded combined = merge(
|
const Encoded combined = merge(
|
||||||
encodeCoordinate(longitude, -180, 180, singleCoordBitsPrecision(precision, LONGITUDE)),
|
encodeCoordinate(longitude, LON_MIN, LON_MAX, singleCoordBitsPrecision(precision, LONGITUDE)),
|
||||||
encodeCoordinate(latitude, -90, 90, singleCoordBitsPrecision(precision, LATITUDE)),
|
encodeCoordinate(latitude, LAT_MIN, LAT_MAX, singleCoordBitsPrecision(precision, LATITUDE)),
|
||||||
precision);
|
precision);
|
||||||
|
|
||||||
base32Encode(combined, precision, out);
|
base32Encode(combined, precision, out);
|
||||||
@ -212,9 +230,28 @@ size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char
|
|||||||
return precision;
|
return precision;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace GeoUtils
|
||||||
|
{
|
||||||
|
|
||||||
|
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char * out)
|
||||||
|
{
|
||||||
|
precision = geohashPrecision(precision);
|
||||||
|
return geohashEncodeImpl(longitude, latitude, precision, out);
|
||||||
|
}
|
||||||
|
|
||||||
void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * longitude, Float64 * latitude)
|
void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * longitude, Float64 * latitude)
|
||||||
{
|
{
|
||||||
const UInt8 precision = std::min(encoded_len, MAX_PRECISION);
|
const UInt8 precision = std::min(encoded_len, static_cast<size_t>(MAX_PRECISION));
|
||||||
if (precision == 0)
|
if (precision == 0)
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
@ -223,8 +260,89 @@ void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * lo
|
|||||||
Encoded lat_encoded, lon_encoded;
|
Encoded lat_encoded, lon_encoded;
|
||||||
std::tie(lon_encoded, lat_encoded) = split(base32Decode(encoded_string, precision), precision);
|
std::tie(lon_encoded, lat_encoded) = split(base32Decode(encoded_string, precision), precision);
|
||||||
|
|
||||||
*longitude = decodeCoordinate(lon_encoded, -180, 180, singleCoordBitsPrecision(precision, LONGITUDE));
|
*longitude = decodeCoordinate(lon_encoded, LON_MIN, LON_MAX, singleCoordBitsPrecision(precision, LONGITUDE));
|
||||||
*latitude = decodeCoordinate(lat_encoded, -90, 90, singleCoordBitsPrecision(precision, LATITUDE));
|
*latitude = decodeCoordinate(lat_encoded, LAT_MIN, LAT_MAX, singleCoordBitsPrecision(precision, LATITUDE));
|
||||||
|
}
|
||||||
|
|
||||||
|
GeohashesInBoxPreparedArgs geohashesInBoxPrepare(const Float64 longitude_min,
|
||||||
|
const Float64 latitude_min,
|
||||||
|
const Float64 longitude_max,
|
||||||
|
const Float64 latitude_max,
|
||||||
|
UInt8 precision)
|
||||||
|
{
|
||||||
|
precision = geohashPrecision(precision);
|
||||||
|
|
||||||
|
if (longitude_max < longitude_min || latitude_max < latitude_min)
|
||||||
|
{
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto lon_step = getSpan(precision, LONGITUDE);
|
||||||
|
const auto lat_step = getSpan(precision, LATITUDE);
|
||||||
|
|
||||||
|
// align max to the right(or up) border of geohash grid cell to ensure that cell is in result.
|
||||||
|
Float64 lon_min = floor(longitude_min / lon_step) * lon_step;
|
||||||
|
Float64 lat_min = floor(latitude_min / lat_step) * lat_step;
|
||||||
|
Float64 lon_max = ceil(longitude_max / lon_step) * lon_step;
|
||||||
|
Float64 lat_max = ceil(latitude_max / lat_step) * lat_step;
|
||||||
|
|
||||||
|
const auto lon_span = lon_max - lon_min;
|
||||||
|
const auto lat_span = lat_max - lat_min;
|
||||||
|
// in case of a very small (or zero) span, produce at least 1 item.
|
||||||
|
const auto items_count = std::max(size_t{1}, static_cast<size_t>(ceil(lon_span/lon_step * lat_span/lat_step)));
|
||||||
|
|
||||||
|
return GeohashesInBoxPreparedArgs{
|
||||||
|
items_count,
|
||||||
|
precision,
|
||||||
|
lon_min,
|
||||||
|
lat_min,
|
||||||
|
lon_max,
|
||||||
|
lat_max,
|
||||||
|
lon_step,
|
||||||
|
lat_step
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & args, char * out)
|
||||||
|
{
|
||||||
|
if (args.items_count == 0
|
||||||
|
|| args.precision == 0
|
||||||
|
|| args.precision > MAX_PRECISION
|
||||||
|
|| args.latitude_min > args.latitude_max
|
||||||
|
|| args.longitude_min > args.longitude_max
|
||||||
|
|| args.longitude_step <= 0
|
||||||
|
|| args.latitude_step <= 0)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
UInt64 items = 0;
|
||||||
|
for (auto lon = args.longitude_min; lon < args.longitude_max; lon += args.longitude_step)
|
||||||
|
{
|
||||||
|
for (auto lat = args.latitude_min; lat < args.latitude_max; lat += args.latitude_step)
|
||||||
|
{
|
||||||
|
assert(items <= args.items_count);
|
||||||
|
|
||||||
|
size_t l = geohashEncodeImpl(lon, lat, args.precision, out);
|
||||||
|
out += l;
|
||||||
|
*out = '\0';
|
||||||
|
++out;
|
||||||
|
|
||||||
|
++items;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (items == 0 && args.items_count != 0)
|
||||||
|
{
|
||||||
|
size_t l = geohashEncodeImpl(args.longitude_min, args.latitude_min, args.precision, out);
|
||||||
|
out += l;
|
||||||
|
*out = '\0';
|
||||||
|
++out;
|
||||||
|
|
||||||
|
++items;
|
||||||
|
}
|
||||||
|
|
||||||
|
return items;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -706,10 +706,33 @@ std::string serialize(Polygon && polygon)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char *& out);
|
size_t geohashEncode(Float64 longitude, Float64 latitude, UInt8 precision, char * out);
|
||||||
|
|
||||||
void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * longitude, Float64 * latitude);
|
void geohashDecode(const char * encoded_string, size_t encoded_len, Float64 * longitude, Float64 * latitude);
|
||||||
|
|
||||||
|
std::vector<std::pair<Float64, Float64>> geohashCoverBox(Float64 longitude_min, Float64 latitude_min, Float64 longitude_max, Float64 latitude_max, UInt8 precision, UInt32 max_items = 0);
|
||||||
|
|
||||||
|
struct GeohashesInBoxPreparedArgs
|
||||||
|
{
|
||||||
|
UInt64 items_count = 0;
|
||||||
|
UInt8 precision = 0;
|
||||||
|
|
||||||
|
Float64 longitude_min = 0.0;
|
||||||
|
Float64 latitude_min = 0.0;
|
||||||
|
Float64 longitude_max = 0.0;
|
||||||
|
Float64 latitude_max = 0.0;
|
||||||
|
|
||||||
|
Float64 longitude_step = 0.0;
|
||||||
|
Float64 latitude_step = 0.0;
|
||||||
|
};
|
||||||
|
|
||||||
|
GeohashesInBoxPreparedArgs geohashesInBoxPrepare(const Float64 longitude_min,
|
||||||
|
const Float64 latitude_min,
|
||||||
|
Float64 longitude_max,
|
||||||
|
Float64 latitude_max,
|
||||||
|
UInt8 precision);
|
||||||
|
|
||||||
|
UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & estimation, char * out);
|
||||||
|
|
||||||
} /// GeoUtils
|
} /// GeoUtils
|
||||||
|
|
||||||
|
169
dbms/src/Functions/geohashesInBox.cpp
Normal file
169
dbms/src/Functions/geohashesInBox.cpp
Normal file
@ -0,0 +1,169 @@
|
|||||||
|
#include <Functions/IFunction.h>
|
||||||
|
#include <Functions/FunctionFactory.h>
|
||||||
|
#include <Functions/FunctionHelpers.h>
|
||||||
|
#include <Functions/GeoUtils.h>
|
||||||
|
|
||||||
|
#include <Columns/ColumnArray.h>
|
||||||
|
#include <Columns/ColumnString.h>
|
||||||
|
#include <DataTypes/DataTypeArray.h>
|
||||||
|
#include <DataTypes/DataTypeString.h>
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int LOGICAL_ERROR;
|
||||||
|
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||||
|
extern const int TOO_LARGE_ARRAY_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
class FunctionGeohashesInBox : public IFunction
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static constexpr auto name = "geohashesInBox";
|
||||||
|
static FunctionPtr create(const Context &) { return std::make_shared<FunctionGeohashesInBox>(); }
|
||||||
|
|
||||||
|
String getName() const override { return name; }
|
||||||
|
|
||||||
|
size_t getNumberOfArguments() const override { return 5; }
|
||||||
|
|
||||||
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||||
|
{
|
||||||
|
validateArgumentType(*this, arguments, 0, isFloat, "float");
|
||||||
|
validateArgumentType(*this, arguments, 1, isFloat, "float");
|
||||||
|
validateArgumentType(*this, arguments, 2, isFloat, "float");
|
||||||
|
validateArgumentType(*this, arguments, 3, isFloat, "float");
|
||||||
|
validateArgumentType(*this, arguments, 4, isUInt8, "integer");
|
||||||
|
|
||||||
|
if (!(arguments[0]->equals(*arguments[1]) &&
|
||||||
|
arguments[0]->equals(*arguments[2]) &&
|
||||||
|
arguments[0]->equals(*arguments[3])))
|
||||||
|
{
|
||||||
|
throw Exception("Illegal type of argument of " + getName() +
|
||||||
|
" all coordinate arguments must have the same type, instead they are:" +
|
||||||
|
arguments[0]->getName() + ", " +
|
||||||
|
arguments[1]->getName() + ", " +
|
||||||
|
arguments[2]->getName() + ", " +
|
||||||
|
arguments[3]->getName() + ".",
|
||||||
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool useDefaultImplementationForConstants() const override { return true; }
|
||||||
|
|
||||||
|
template <typename LonAndLatType, typename PrecisionType>
|
||||||
|
void execute(const IColumn * lon_min_column,
|
||||||
|
const IColumn * lat_min_column,
|
||||||
|
const IColumn * lon_max_column,
|
||||||
|
const IColumn * lat_max_column,
|
||||||
|
const IColumn * precision_column,
|
||||||
|
ColumnPtr & result)
|
||||||
|
{
|
||||||
|
static constexpr size_t max_array_size = 10'000'000;
|
||||||
|
|
||||||
|
const auto * lon_min = checkAndGetColumn<ColumnVector<LonAndLatType>>(lon_min_column);
|
||||||
|
const auto * lat_min = checkAndGetColumn<ColumnVector<LonAndLatType>>(lat_min_column);
|
||||||
|
const auto * lon_max = checkAndGetColumn<ColumnVector<LonAndLatType>>(lon_max_column);
|
||||||
|
const auto * lat_max = checkAndGetColumn<ColumnVector<LonAndLatType>>(lat_max_column);
|
||||||
|
auto * precision = checkAndGetColumn<ColumnVector<PrecisionType>>(precision_column);
|
||||||
|
if (precision == nullptr)
|
||||||
|
{
|
||||||
|
precision = checkAndGetColumnConstData<ColumnVector<PrecisionType>>(precision_column);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!lon_min || !lat_min || !lon_max || !lat_max || !precision)
|
||||||
|
{
|
||||||
|
throw Exception("Unsupported argument types for function " + getName() + " : " +
|
||||||
|
lon_min_column->getName() + ", " +
|
||||||
|
lat_min_column->getName() + ", " +
|
||||||
|
lon_max_column->getName() + ", " +
|
||||||
|
lat_max_column->getName() + ".",
|
||||||
|
ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t total_rows = lat_min->size();
|
||||||
|
|
||||||
|
auto col_res = ColumnArray::create(ColumnString::create());
|
||||||
|
ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData());
|
||||||
|
ColumnArray::Offsets & res_offsets = col_res->getOffsets();
|
||||||
|
ColumnString::Chars & res_strings_chars = res_strings.getChars();
|
||||||
|
ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
|
||||||
|
|
||||||
|
for (size_t row = 0; row < total_rows; ++row)
|
||||||
|
{
|
||||||
|
const Float64 lon_min_value = lon_min->getElement(row);
|
||||||
|
const Float64 lat_min_value = lat_min->getElement(row);
|
||||||
|
const Float64 lon_max_value = lon_max->getElement(row);
|
||||||
|
const Float64 lat_max_value = lat_max->getElement(row);
|
||||||
|
|
||||||
|
const auto prepared_args = GeoUtils::geohashesInBoxPrepare(
|
||||||
|
lon_min_value, lat_min_value, lon_max_value, lat_max_value,
|
||||||
|
precision->getElement(row % precision->size()));
|
||||||
|
if (prepared_args.items_count > max_array_size)
|
||||||
|
{
|
||||||
|
throw Exception(getName() + " would produce " + std::to_string(prepared_args.items_count) +
|
||||||
|
" array elements, which is bigger than the allowed maximum of " + std::to_string(max_array_size),
|
||||||
|
ErrorCodes::TOO_LARGE_ARRAY_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
res_strings_offsets.reserve(res_strings_offsets.size() + prepared_args.items_count);
|
||||||
|
res_strings_chars.resize(res_strings_chars.size() + prepared_args.items_count * (prepared_args.precision + 1));
|
||||||
|
const auto starting_offset = res_strings_offsets.empty() ? 0 : res_strings_offsets.back();
|
||||||
|
char * out = reinterpret_cast<char *>(res_strings_chars.data() + starting_offset);
|
||||||
|
|
||||||
|
// Actually write geohashes into preallocated buffer.
|
||||||
|
GeoUtils::geohashesInBox(prepared_args, out);
|
||||||
|
|
||||||
|
for (UInt8 i = 1; i <= prepared_args.items_count ; ++i)
|
||||||
|
{
|
||||||
|
res_strings_offsets.push_back(starting_offset + (prepared_args.precision + 1) * i);
|
||||||
|
}
|
||||||
|
res_offsets.push_back((res_offsets.empty() ? 0 : res_offsets.back()) + prepared_args.items_count);
|
||||||
|
}
|
||||||
|
if (!res_strings_offsets.empty() && res_strings_offsets.back() != res_strings_chars.size())
|
||||||
|
{
|
||||||
|
throw Exception("String column size mismatch (internal logical error)", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!res_offsets.empty() && res_offsets.back() != res_strings.size())
|
||||||
|
{
|
||||||
|
throw Exception("Arrary column size mismatch (internal logical error)" +
|
||||||
|
std::to_string(res_offsets.back()) + " != " + std::to_string(res_strings.size()),
|
||||||
|
ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
result = std::move(col_res);
|
||||||
|
}
|
||||||
|
|
||||||
|
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
||||||
|
{
|
||||||
|
const IColumn * lon_min = block.getByPosition(arguments[0]).column.get();
|
||||||
|
const IColumn * lat_min = block.getByPosition(arguments[1]).column.get();
|
||||||
|
const IColumn * lon_max = block.getByPosition(arguments[2]).column.get();
|
||||||
|
const IColumn * lat_max = block.getByPosition(arguments[3]).column.get();
|
||||||
|
const IColumn * prec = block.getByPosition(arguments[4]).column.get();
|
||||||
|
ColumnPtr & res = block.getByPosition(result).column;
|
||||||
|
|
||||||
|
if (checkColumn<ColumnVector<Float32>>(lon_min))
|
||||||
|
{
|
||||||
|
execute<Float32, UInt8>(lon_min, lat_min, lon_max, lat_max, prec, res);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
execute<Float64, UInt8>(lon_min, lat_min, lon_max, lat_max, prec, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void registerFunctionGeohashesInBox(FunctionFactory & factory)
|
||||||
|
{
|
||||||
|
factory.registerFunction<FunctionGeohashesInBox>();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -10,6 +10,7 @@ void registerFunctionPointInEllipses(FunctionFactory & factory);
|
|||||||
void registerFunctionPointInPolygon(FunctionFactory & factory);
|
void registerFunctionPointInPolygon(FunctionFactory & factory);
|
||||||
void registerFunctionGeohashEncode(FunctionFactory & factory);
|
void registerFunctionGeohashEncode(FunctionFactory & factory);
|
||||||
void registerFunctionGeohashDecode(FunctionFactory & factory);
|
void registerFunctionGeohashDecode(FunctionFactory & factory);
|
||||||
|
void registerFunctionGeohashesInBox(FunctionFactory & factory);
|
||||||
|
|
||||||
#if USE_H3
|
#if USE_H3
|
||||||
void registerFunctionGeoToH3(FunctionFactory &);
|
void registerFunctionGeoToH3(FunctionFactory &);
|
||||||
@ -22,6 +23,7 @@ void registerFunctionsGeo(FunctionFactory & factory)
|
|||||||
registerFunctionPointInPolygon(factory);
|
registerFunctionPointInPolygon(factory);
|
||||||
registerFunctionGeohashEncode(factory);
|
registerFunctionGeohashEncode(factory);
|
||||||
registerFunctionGeohashDecode(factory);
|
registerFunctionGeohashDecode(factory);
|
||||||
|
registerFunctionGeohashesInBox(factory);
|
||||||
|
|
||||||
#if USE_H3
|
#if USE_H3
|
||||||
registerFunctionGeoToH3(factory);
|
registerFunctionGeoToH3(factory);
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Functions/FunctionFactory.h>
|
#include <Functions/FunctionFactory.h>
|
||||||
#include <Functions/FunctionHelpers.h>
|
#include <Functions/FunctionHelpers.h>
|
||||||
|
#include <Columns/ColumnString.h>
|
||||||
#include <Columns/ColumnsNumber.h>
|
#include <Columns/ColumnsNumber.h>
|
||||||
#include <Columns/ColumnsCommon.h>
|
#include <Columns/ColumnsCommon.h>
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
|
#include <IO/WriteHelpers.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -13,6 +15,7 @@ namespace ErrorCodes
|
|||||||
{
|
{
|
||||||
extern const int ILLEGAL_COLUMN;
|
extern const int ILLEGAL_COLUMN;
|
||||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||||
|
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||||
extern const int FUNCTION_THROW_IF_VALUE_IS_NON_ZERO;
|
extern const int FUNCTION_THROW_IF_VALUE_IS_NON_ZERO;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -32,46 +35,70 @@ public:
|
|||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isVariadic() const override { return true; }
|
||||||
size_t getNumberOfArguments() const override
|
size_t getNumberOfArguments() const override
|
||||||
{
|
{
|
||||||
return 1;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||||
{
|
{
|
||||||
if (!isNativeNumber(arguments.front()))
|
const size_t number_of_arguments = arguments.size();
|
||||||
|
|
||||||
|
if (number_of_arguments < 1 || number_of_arguments > 2)
|
||||||
|
throw Exception{"Number of arguments for function " + getName() + " doesn't match: passed "
|
||||||
|
+ toString(number_of_arguments) + ", should be 1 or 2",
|
||||||
|
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH};
|
||||||
|
|
||||||
|
if (!isNativeNumber(arguments[0]))
|
||||||
throw Exception{"Argument for function " + getName() + " must be number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
throw Exception{"Argument for function " + getName() + " must be number", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||||
|
|
||||||
|
if (number_of_arguments > 1 && !isString(arguments[1]))
|
||||||
|
throw Exception{"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(),
|
||||||
|
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
|
||||||
|
|
||||||
|
|
||||||
return std::make_shared<DataTypeUInt8>();
|
return std::make_shared<DataTypeUInt8>();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool useDefaultImplementationForConstants() const override { return true; }
|
bool useDefaultImplementationForConstants() const override { return true; }
|
||||||
|
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
||||||
|
|
||||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
||||||
{
|
{
|
||||||
|
std::optional<String> custom_message;
|
||||||
|
if (arguments.size() == 2)
|
||||||
|
{
|
||||||
|
auto * msg_column = checkAndGetColumnConst<ColumnString>(block.getByPosition(arguments[1]).column.get());
|
||||||
|
if (!msg_column)
|
||||||
|
throw Exception{"Second argument for function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_COLUMN};
|
||||||
|
custom_message = msg_column->getValue<String>();
|
||||||
|
}
|
||||||
|
|
||||||
const auto in = block.getByPosition(arguments.front()).column.get();
|
const auto in = block.getByPosition(arguments.front()).column.get();
|
||||||
|
|
||||||
if ( !execute<UInt8>(block, in, result)
|
if ( !execute<UInt8>(block, in, result, custom_message)
|
||||||
&& !execute<UInt16>(block, in, result)
|
&& !execute<UInt16>(block, in, result, custom_message)
|
||||||
&& !execute<UInt32>(block, in, result)
|
&& !execute<UInt32>(block, in, result, custom_message)
|
||||||
&& !execute<UInt64>(block, in, result)
|
&& !execute<UInt64>(block, in, result, custom_message)
|
||||||
&& !execute<Int8>(block, in, result)
|
&& !execute<Int8>(block, in, result, custom_message)
|
||||||
&& !execute<Int16>(block, in, result)
|
&& !execute<Int16>(block, in, result, custom_message)
|
||||||
&& !execute<Int32>(block, in, result)
|
&& !execute<Int32>(block, in, result, custom_message)
|
||||||
&& !execute<Int64>(block, in, result)
|
&& !execute<Int64>(block, in, result, custom_message)
|
||||||
&& !execute<Float32>(block, in, result)
|
&& !execute<Float32>(block, in, result, custom_message)
|
||||||
&& !execute<Float64>(block, in, result))
|
&& !execute<Float64>(block, in, result, custom_message))
|
||||||
throw Exception{"Illegal column " + in->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
|
throw Exception{"Illegal column " + in->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool execute(Block & block, const IColumn * in_untyped, const size_t result)
|
bool execute(Block & block, const IColumn * in_untyped, const size_t result, const std::optional<String> & message)
|
||||||
{
|
{
|
||||||
if (const auto in = checkAndGetColumn<ColumnVector<T>>(in_untyped))
|
if (const auto in = checkAndGetColumn<ColumnVector<T>>(in_untyped))
|
||||||
{
|
{
|
||||||
const auto & in_data = in->getData();
|
const auto & in_data = in->getData();
|
||||||
if (!memoryIsZero(in_data.data(), in_data.size() * sizeof(in_data[0])))
|
if (!memoryIsZero(in_data.data(), in_data.size() * sizeof(in_data[0])))
|
||||||
throw Exception("Value passed to 'throwIf' function is non zero", ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO);
|
throw Exception{message.value_or("Value passed to '" + getName() + "' function is non zero"),
|
||||||
|
ErrorCodes::FUNCTION_THROW_IF_VALUE_IS_NON_ZERO};
|
||||||
|
|
||||||
/// We return non constant to avoid constant folding.
|
/// We return non constant to avoid constant folding.
|
||||||
block.getByPosition(result).column = ColumnUInt8::create(in_data.size(), 0);
|
block.getByPosition(result).column = ColumnUInt8::create(in_data.size(), 0);
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <streambuf>
|
#include <streambuf>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
@ -1040,23 +1040,35 @@ void ExpressionAnalyzer::collectUsedColumns()
|
|||||||
/// You need to read at least one column to find the number of rows.
|
/// You need to read at least one column to find the number of rows.
|
||||||
if (select_query && required.empty())
|
if (select_query && required.empty())
|
||||||
{
|
{
|
||||||
/// We will find a column with minimum compressed size. Because it is the column that is cheapest to read.
|
/// We will find a column with minimum <compressed_size, type_size, uncompressed_size>.
|
||||||
size_t min_data_compressed = 0;
|
/// Because it is the column that is cheapest to read.
|
||||||
String min_column_name;
|
struct ColumnSizeTuple
|
||||||
|
{
|
||||||
|
size_t compressed_size;
|
||||||
|
size_t type_size;
|
||||||
|
size_t uncompressed_size;
|
||||||
|
String name;
|
||||||
|
bool operator<(const ColumnSizeTuple & that) const
|
||||||
|
{
|
||||||
|
return std::tie(compressed_size, type_size, uncompressed_size)
|
||||||
|
< std::tie(that.compressed_size, that.type_size, that.uncompressed_size);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
std::vector<ColumnSizeTuple> columns;
|
||||||
if (storage)
|
if (storage)
|
||||||
{
|
{
|
||||||
auto column_sizes = storage->getColumnSizes();
|
auto column_sizes = storage->getColumnSizes();
|
||||||
for (auto & [column_name, column_size] : column_sizes)
|
for (auto & source_column : source_columns)
|
||||||
{
|
{
|
||||||
if (min_data_compressed == 0 || min_data_compressed > column_size.data_compressed)
|
auto c = column_sizes.find(source_column.name);
|
||||||
{
|
if (c == column_sizes.end())
|
||||||
min_data_compressed = column_size.data_compressed;
|
continue;
|
||||||
min_column_name = column_name;
|
size_t type_size = source_column.type->haveMaximumSizeOfValue() ? source_column.type->getMaximumSizeOfValueInMemory() : 100;
|
||||||
}
|
columns.emplace_back(ColumnSizeTuple{c->second.data_compressed, type_size, c->second.data_uncompressed, source_column.name});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (min_data_compressed > 0)
|
if (columns.size())
|
||||||
required.insert(min_column_name);
|
required.insert(std::min_element(columns.begin(), columns.end())->name);
|
||||||
else
|
else
|
||||||
/// If we have no information about columns sizes, choose a column of minimum size of its data type.
|
/// If we have no information about columns sizes, choose a column of minimum size of its data type.
|
||||||
required.insert(ExpressionActions::getSmallestColumn(source_columns));
|
required.insert(ExpressionActions::getSmallestColumn(source_columns));
|
||||||
|
@ -26,7 +26,7 @@ namespace DB
|
|||||||
|
|
||||||
namespace ErrorCodes
|
namespace ErrorCodes
|
||||||
{
|
{
|
||||||
extern const int UNKNOWN_SET_DATA_VARIANT;
|
extern const int UNSUPPORTED_JOIN_KEYS;
|
||||||
extern const int LOGICAL_ERROR;
|
extern const int LOGICAL_ERROR;
|
||||||
extern const int SET_SIZE_LIMIT_EXCEEDED;
|
extern const int SET_SIZE_LIMIT_EXCEEDED;
|
||||||
extern const int TYPE_MISMATCH;
|
extern const int TYPE_MISMATCH;
|
||||||
@ -770,7 +770,7 @@ IColumn::Filter switchJoinRightColumns(
|
|||||||
#undef M
|
#undef M
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
|
throw Exception("Unsupported JOIN keys. Type: " + toString(static_cast<UInt32>(type)), ErrorCodes::UNSUPPORTED_JOIN_KEYS);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1350,7 +1350,8 @@ private:
|
|||||||
APPLY_FOR_JOIN_VARIANTS(M)
|
APPLY_FOR_JOIN_VARIANTS(M)
|
||||||
#undef M
|
#undef M
|
||||||
default:
|
default:
|
||||||
throw Exception("Unknown JOIN keys variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
|
throw Exception("Unsupported JOIN keys. Type: " + toString(static_cast<UInt32>(parent.type)),
|
||||||
|
ErrorCodes::UNSUPPORTED_JOIN_KEYS);
|
||||||
}
|
}
|
||||||
|
|
||||||
__builtin_unreachable();
|
__builtin_unreachable();
|
||||||
|
@ -8,7 +8,6 @@ namespace DB
|
|||||||
|
|
||||||
namespace ErrorCodes
|
namespace ErrorCodes
|
||||||
{
|
{
|
||||||
extern const int UNKNOWN_SET_DATA_VARIANT;
|
|
||||||
extern const int LOGICAL_ERROR;
|
extern const int LOGICAL_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -185,9 +185,12 @@ void PipelineExecutor::expandPipeline(Stack & stack, UInt64 pid)
|
|||||||
graph.emplace_back(processor.get(), graph.size());
|
graph.emplace_back(processor.get(), graph.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
processors.insert(processors.end(), new_processors.begin(), new_processors.end());
|
{
|
||||||
UInt64 num_processors = processors.size();
|
std::lock_guard guard(processors_mutex);
|
||||||
|
processors.insert(processors.end(), new_processors.begin(), new_processors.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
UInt64 num_processors = processors.size();
|
||||||
for (UInt64 node = 0; node < num_processors; ++node)
|
for (UInt64 node = 0; node < num_processors; ++node)
|
||||||
{
|
{
|
||||||
if (addEdges(node))
|
if (addEdges(node))
|
||||||
@ -374,6 +377,16 @@ void PipelineExecutor::doExpandPipeline(ExpandPipelineTask * task, bool processi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PipelineExecutor::cancel()
|
||||||
|
{
|
||||||
|
cancelled = true;
|
||||||
|
finish();
|
||||||
|
|
||||||
|
std::lock_guard guard(processors_mutex);
|
||||||
|
for (auto & processor : processors)
|
||||||
|
processor->cancel();
|
||||||
|
}
|
||||||
|
|
||||||
void PipelineExecutor::finish()
|
void PipelineExecutor::finish()
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
|
@ -35,14 +35,11 @@ public:
|
|||||||
const Processors & getProcessors() const { return processors; }
|
const Processors & getProcessors() const { return processors; }
|
||||||
|
|
||||||
/// Cancel execution. May be called from another thread.
|
/// Cancel execution. May be called from another thread.
|
||||||
void cancel()
|
void cancel();
|
||||||
{
|
|
||||||
cancelled = true;
|
|
||||||
finish();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Processors & processors;
|
Processors & processors;
|
||||||
|
std::mutex processors_mutex;
|
||||||
|
|
||||||
struct Edge
|
struct Edge
|
||||||
{
|
{
|
||||||
@ -75,8 +72,8 @@ private:
|
|||||||
std::exception_ptr exception;
|
std::exception_ptr exception;
|
||||||
std::function<void()> job;
|
std::function<void()> job;
|
||||||
|
|
||||||
IProcessor * processor;
|
IProcessor * processor = nullptr;
|
||||||
UInt64 processors_id;
|
UInt64 processors_id = 0;
|
||||||
|
|
||||||
/// Counters for profiling.
|
/// Counters for profiling.
|
||||||
size_t num_executed_jobs = 0;
|
size_t num_executed_jobs = 0;
|
||||||
|
@ -27,6 +27,12 @@ public:
|
|||||||
: ISource(std::move(header)), in(in)
|
: ISource(std::move(header)), in(in)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual const BlockMissingValues & getMissingValues() const
|
||||||
|
{
|
||||||
|
static const BlockMissingValues none;
|
||||||
|
return none;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -58,6 +58,17 @@ public:
|
|||||||
virtual std::string getContentType() const { return "text/plain; charset=UTF-8"; }
|
virtual std::string getContentType() const { return "text/plain; charset=UTF-8"; }
|
||||||
|
|
||||||
InputPort & getPort(PortKind kind) { return *std::next(inputs.begin(), kind); }
|
InputPort & getPort(PortKind kind) { return *std::next(inputs.begin(), kind); }
|
||||||
|
|
||||||
|
public:
|
||||||
|
/// Compatible to IBlockOutputStream interface
|
||||||
|
|
||||||
|
void write(const Block & block) { consume(Chunk(block.getColumns(), block.rows())); }
|
||||||
|
|
||||||
|
virtual void doWritePrefix() {}
|
||||||
|
virtual void doWriteSuffix() { finalize(); }
|
||||||
|
|
||||||
|
void setTotals(const Block & totals) { consumeTotals(Chunk(totals.getColumns(), totals.rows())); }
|
||||||
|
void setExtremes(const Block & extremes) { consumeExtremes(Chunk(extremes.getColumns(), extremes.rows())); }
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include <Processors/Formats/IRowInputFormat.h>
|
#include <Processors/Formats/IRowInputFormat.h>
|
||||||
#include <IO/WriteHelpers.h> // toString
|
#include <IO/WriteHelpers.h> // toString
|
||||||
|
#include <common/logger_useful.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -16,6 +17,7 @@ namespace ErrorCodes
|
|||||||
extern const int CANNOT_PARSE_UUID;
|
extern const int CANNOT_PARSE_UUID;
|
||||||
extern const int TOO_LARGE_STRING_SIZE;
|
extern const int TOO_LARGE_STRING_SIZE;
|
||||||
extern const int INCORRECT_NUMBER_OF_COLUMNS;
|
extern const int INCORRECT_NUMBER_OF_COLUMNS;
|
||||||
|
extern const int TIMEOUT_EXCEEDED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -32,6 +34,33 @@ static bool isParseError(int code)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool handleOverflowMode(OverflowMode mode, const String & message, int code)
|
||||||
|
{
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
case OverflowMode::THROW:
|
||||||
|
throw Exception(message, code);
|
||||||
|
case OverflowMode::BREAK:
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
throw Exception("Logical error: unknown overflow mode", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool checkTimeLimit(const IRowInputFormat::Params & params, const Stopwatch & stopwatch)
|
||||||
|
{
|
||||||
|
if (params.max_execution_time != 0
|
||||||
|
&& stopwatch.elapsed() > static_cast<UInt64>(params.max_execution_time.totalMicroseconds()) * 1000)
|
||||||
|
return handleOverflowMode(params.timeout_overflow_mode,
|
||||||
|
"Timeout exceeded: elapsed " + toString(stopwatch.elapsedSeconds())
|
||||||
|
+ " seconds, maximum: " + toString(params.max_execution_time.totalMicroseconds() / 1000000.0),
|
||||||
|
ErrorCodes::TIMEOUT_EXCEEDED);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Chunk IRowInputFormat::generate()
|
Chunk IRowInputFormat::generate()
|
||||||
{
|
{
|
||||||
if (total_rows == 0)
|
if (total_rows == 0)
|
||||||
@ -43,12 +72,19 @@ Chunk IRowInputFormat::generate()
|
|||||||
MutableColumns columns = header.cloneEmptyColumns();
|
MutableColumns columns = header.cloneEmptyColumns();
|
||||||
size_t prev_rows = total_rows;
|
size_t prev_rows = total_rows;
|
||||||
|
|
||||||
auto chunk_missing_values = std::make_unique<ChunkMissingValues>();
|
///auto chunk_missing_values = std::make_unique<ChunkMissingValues>();
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
for (size_t rows = 0; rows < params.max_block_size; ++rows)
|
for (size_t rows = 0, batch = 0; rows < params.max_block_size; ++rows, ++batch)
|
||||||
{
|
{
|
||||||
|
if (params.rows_portion_size && batch == params.rows_portion_size)
|
||||||
|
{
|
||||||
|
batch = 0;
|
||||||
|
if (!checkTimeLimit(params, total_stopwatch) || isCancelled())
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
++total_rows;
|
++total_rows;
|
||||||
@ -56,6 +92,8 @@ Chunk IRowInputFormat::generate()
|
|||||||
RowReadExtension info;
|
RowReadExtension info;
|
||||||
if (!readRow(columns, info))
|
if (!readRow(columns, info))
|
||||||
break;
|
break;
|
||||||
|
if (params.callback)
|
||||||
|
params.callback();
|
||||||
|
|
||||||
for (size_t column_idx = 0; column_idx < info.read_columns.size(); ++column_idx)
|
for (size_t column_idx = 0; column_idx < info.read_columns.size(); ++column_idx)
|
||||||
{
|
{
|
||||||
@ -64,7 +102,7 @@ Chunk IRowInputFormat::generate()
|
|||||||
size_t column_size = columns[column_idx]->size();
|
size_t column_size = columns[column_idx]->size();
|
||||||
if (column_size == 0)
|
if (column_size == 0)
|
||||||
throw Exception("Unexpected empty column", ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS);
|
throw Exception("Unexpected empty column", ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS);
|
||||||
chunk_missing_values->setBit(column_idx, column_size - 1);
|
block_missing_values.setBit(column_idx, column_size - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -134,12 +172,18 @@ Chunk IRowInputFormat::generate()
|
|||||||
|
|
||||||
if (columns.empty() || columns[0]->empty())
|
if (columns.empty() || columns[0]->empty())
|
||||||
{
|
{
|
||||||
|
if (params.allow_errors_num > 0 || params.allow_errors_ratio > 0)
|
||||||
|
{
|
||||||
|
Logger * log = &Logger::get("BlockInputStreamFromRowInputStream");
|
||||||
|
LOG_TRACE(log, "Skipped " << num_errors << " rows with errors while reading the input stream");
|
||||||
|
}
|
||||||
|
|
||||||
readSuffix();
|
readSuffix();
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk chunk(std::move(columns), total_rows - prev_rows);
|
Chunk chunk(std::move(columns), total_rows - prev_rows);
|
||||||
chunk.setChunkInfo(std::move(chunk_missing_values));
|
//chunk.setChunkInfo(std::move(chunk_missing_values));
|
||||||
return chunk;
|
return chunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,6 +3,9 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <Columns/IColumn.h>
|
#include <Columns/IColumn.h>
|
||||||
#include <Processors/Formats/IInputFormat.h>
|
#include <Processors/Formats/IInputFormat.h>
|
||||||
|
#include <DataStreams/SizeLimits.h>
|
||||||
|
#include <Poco/Timespan.h>
|
||||||
|
#include <Common/Stopwatch.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -23,6 +26,14 @@ struct RowInputFormatParams
|
|||||||
|
|
||||||
UInt64 allow_errors_num;
|
UInt64 allow_errors_num;
|
||||||
Float64 allow_errors_ratio;
|
Float64 allow_errors_ratio;
|
||||||
|
|
||||||
|
UInt64 rows_portion_size;
|
||||||
|
|
||||||
|
using ReadCallback = std::function<void()>;
|
||||||
|
ReadCallback callback;
|
||||||
|
|
||||||
|
Poco::Timespan max_execution_time = 0;
|
||||||
|
OverflowMode timeout_overflow_mode = OverflowMode::THROW;
|
||||||
};
|
};
|
||||||
|
|
||||||
///Row oriented input format: reads data row by row.
|
///Row oriented input format: reads data row by row.
|
||||||
@ -61,11 +72,16 @@ protected:
|
|||||||
/// If not implemented, returns empty string.
|
/// If not implemented, returns empty string.
|
||||||
virtual std::string getDiagnosticInfo() { return {}; }
|
virtual std::string getDiagnosticInfo() { return {}; }
|
||||||
|
|
||||||
|
const BlockMissingValues & getMissingValues() const override { return block_missing_values; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Params params;
|
Params params;
|
||||||
|
Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE};
|
||||||
|
|
||||||
size_t total_rows = 0;
|
size_t total_rows = 0;
|
||||||
size_t num_errors = 0;
|
size_t num_errors = 0;
|
||||||
|
|
||||||
|
BlockMissingValues block_missing_values;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#include <Formats/verbosePrintString.h>
|
#include <Formats/verbosePrintString.h>
|
||||||
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
|
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
|
#include <DataTypes/DataTypeNullable.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -18,16 +19,66 @@ namespace ErrorCodes
|
|||||||
|
|
||||||
CSVRowInputFormat::CSVRowInputFormat(
|
CSVRowInputFormat::CSVRowInputFormat(
|
||||||
ReadBuffer & in_, Block header, Params params, bool with_names_, const FormatSettings & format_settings)
|
ReadBuffer & in_, Block header, Params params, bool with_names_, const FormatSettings & format_settings)
|
||||||
: IRowInputFormat(std::move(header), in_, params), with_names(with_names_), format_settings(format_settings)
|
: IRowInputFormat(std::move(header), in_, std::move(params))
|
||||||
|
, with_names(with_names_)
|
||||||
|
, format_settings(format_settings)
|
||||||
{
|
{
|
||||||
auto & sample = getPort().getHeader();
|
auto & sample = getPort().getHeader();
|
||||||
size_t num_columns = sample.columns();
|
size_t num_columns = sample.columns();
|
||||||
|
|
||||||
data_types.resize(num_columns);
|
data_types.resize(num_columns);
|
||||||
|
column_indexes_by_names.reserve(num_columns);
|
||||||
|
column_idx_to_nullable_column_idx.resize(num_columns);
|
||||||
|
|
||||||
for (size_t i = 0; i < num_columns; ++i)
|
for (size_t i = 0; i < num_columns; ++i)
|
||||||
data_types[i] = sample.safeGetByPosition(i).type;
|
{
|
||||||
|
const auto & column_info = sample.getByPosition(i);
|
||||||
|
|
||||||
|
data_types[i] = column_info.type;
|
||||||
|
column_indexes_by_names.emplace(column_info.name, i);
|
||||||
|
|
||||||
|
/// If input_format_null_as_default=1 we need ColumnNullable of type DataTypeNullable(nested_type)
|
||||||
|
/// to parse value as nullable before inserting it in corresponding column of not-nullable type.
|
||||||
|
/// Constructing temporary column for each row is slow, so we prepare it here
|
||||||
|
if (format_settings.csv.null_as_default && !column_info.type->isNullable() && column_info.type->canBeInsideNullable())
|
||||||
|
{
|
||||||
|
column_idx_to_nullable_column_idx[i] = nullable_columns.size();
|
||||||
|
nullable_types.emplace_back(std::make_shared<DataTypeNullable>(column_info.type));
|
||||||
|
nullable_columns.emplace_back(nullable_types.back()->createColumn());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Map an input file column to a table column, based on its name.
|
||||||
|
void CSVRowInputFormat::addInputColumn(const String & column_name)
|
||||||
|
{
|
||||||
|
const auto column_it = column_indexes_by_names.find(column_name);
|
||||||
|
if (column_it == column_indexes_by_names.end())
|
||||||
|
{
|
||||||
|
if (format_settings.skip_unknown_fields)
|
||||||
|
{
|
||||||
|
column_indexes_for_input_fields.push_back(std::nullopt);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw Exception(
|
||||||
|
"Unknown field found in CSV header: '" + column_name + "' " +
|
||||||
|
"at position " + std::to_string(column_indexes_for_input_fields.size()) +
|
||||||
|
"\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed",
|
||||||
|
ErrorCodes::INCORRECT_DATA
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto column_index = column_it->second;
|
||||||
|
|
||||||
|
if (read_columns[column_index])
|
||||||
|
throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA);
|
||||||
|
|
||||||
|
read_columns[column_index] = true;
|
||||||
|
column_indexes_for_input_fields.emplace_back(column_index);
|
||||||
|
}
|
||||||
|
|
||||||
static void skipEndOfLine(ReadBuffer & istr)
|
static void skipEndOfLine(ReadBuffer & istr)
|
||||||
{
|
{
|
||||||
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
|
/// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic)
|
||||||
@ -106,28 +157,111 @@ void CSVRowInputFormat::readPrefix()
|
|||||||
|
|
||||||
size_t num_columns = data_types.size();
|
size_t num_columns = data_types.size();
|
||||||
String tmp;
|
String tmp;
|
||||||
|
auto & header = getPort().getHeader();
|
||||||
|
|
||||||
if (with_names)
|
if (with_names)
|
||||||
skipRow(in, format_settings.csv, num_columns);
|
{
|
||||||
|
/// This CSV file has a header row with column names. Depending on the
|
||||||
|
/// settings, use it or skip it.
|
||||||
|
if (format_settings.with_names_use_header)
|
||||||
|
{
|
||||||
|
/// Look at the file header to see which columns we have there.
|
||||||
|
/// The missing columns are filled with defaults.
|
||||||
|
read_columns.assign(header.columns(), false);
|
||||||
|
do
|
||||||
|
{
|
||||||
|
String column_name;
|
||||||
|
skipWhitespacesAndTabs(in);
|
||||||
|
readCSVString(column_name, in, format_settings.csv);
|
||||||
|
skipWhitespacesAndTabs(in);
|
||||||
|
|
||||||
|
addInputColumn(column_name);
|
||||||
|
}
|
||||||
|
while (checkChar(format_settings.csv.delimiter, in));
|
||||||
|
|
||||||
|
skipDelimiter(in, format_settings.csv.delimiter, true);
|
||||||
|
|
||||||
|
for (auto read_column : read_columns)
|
||||||
|
{
|
||||||
|
if (!read_column)
|
||||||
|
{
|
||||||
|
have_always_default_columns = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
skipRow(in, format_settings.csv, num_columns);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The default: map each column of the file to the column of the table with
|
||||||
|
/// the same index.
|
||||||
|
read_columns.assign(header.columns(), true);
|
||||||
|
column_indexes_for_input_fields.resize(header.columns());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i)
|
||||||
|
{
|
||||||
|
column_indexes_for_input_fields[i] = i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
|
bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext)
|
||||||
{
|
{
|
||||||
if (in.eof())
|
if (in.eof())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
updateDiagnosticInfo();
|
updateDiagnosticInfo();
|
||||||
|
|
||||||
size_t size = data_types.size();
|
/// Track whether we have to fill any columns in this row with default
|
||||||
|
/// values. If not, we return an empty column mask to the caller, so that
|
||||||
|
/// it doesn't have to check it.
|
||||||
|
bool have_default_columns = have_always_default_columns;
|
||||||
|
|
||||||
for (size_t i = 0; i < size; ++i)
|
const auto delimiter = format_settings.csv.delimiter;
|
||||||
|
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
||||||
{
|
{
|
||||||
skipWhitespacesAndTabs(in);
|
const auto & table_column = column_indexes_for_input_fields[file_column];
|
||||||
data_types[i]->deserializeAsTextCSV(*columns[i], in, format_settings);
|
const bool is_last_file_column =
|
||||||
skipWhitespacesAndTabs(in);
|
file_column + 1 == column_indexes_for_input_fields.size();
|
||||||
|
|
||||||
skipDelimiter(in, format_settings.csv.delimiter, i + 1 == size);
|
if (table_column)
|
||||||
|
{
|
||||||
|
skipWhitespacesAndTabs(in);
|
||||||
|
read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column],
|
||||||
|
is_last_file_column, *table_column);
|
||||||
|
if (!read_columns[*table_column])
|
||||||
|
have_default_columns = true;
|
||||||
|
skipWhitespacesAndTabs(in);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/// We never read this column from the file, just skip it.
|
||||||
|
String tmp;
|
||||||
|
readCSVString(tmp, in, format_settings.csv);
|
||||||
|
}
|
||||||
|
|
||||||
|
skipDelimiter(in, delimiter, is_last_file_column);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (have_default_columns)
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < read_columns.size(); i++)
|
||||||
|
{
|
||||||
|
if (!read_columns[i])
|
||||||
|
{
|
||||||
|
/// The column value for this row is going to be overwritten
|
||||||
|
/// with default by the caller, but the general assumption is
|
||||||
|
/// that the column size increases for each row, so we have
|
||||||
|
/// to insert something. Since we do not care about the exact
|
||||||
|
/// value, we do not have to use the default value specified by
|
||||||
|
/// the data type, and can just use IColumn::insertDefault().
|
||||||
|
columns[i]->insertDefault();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ext.read_columns = read_columns;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -190,93 +324,126 @@ String CSVRowInputFormat::getDiagnosticInfo()
|
|||||||
return out.str();
|
return out.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** gcc-7 generates wrong code with optimization level greater than 1.
|
||||||
bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
* See tests: dbms/src/IO/tests/write_int.cpp
|
||||||
|
* and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
|
||||||
|
* This is compiler bug. The bug does not present in gcc-8 and clang-8.
|
||||||
|
* Nevertheless, we don't need high optimization of this function.
|
||||||
|
*/
|
||||||
|
bool OPTIMIZE(1) CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
|
||||||
{
|
{
|
||||||
const char delimiter = format_settings.csv.delimiter;
|
const char delimiter = format_settings.csv.delimiter;
|
||||||
auto & header = getPort().getHeader();
|
|
||||||
|
|
||||||
size_t size = data_types.size();
|
for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column)
|
||||||
for (size_t i = 0; i < size; ++i)
|
|
||||||
{
|
{
|
||||||
if (i == 0 && in.eof())
|
if (file_column == 0 && in.eof())
|
||||||
{
|
{
|
||||||
out << "<End of stream>\n";
|
out << "<End of stream>\n";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
out << "Column " << i << ", " << std::string((i < 10 ? 2 : i < 100 ? 1 : 0), ' ')
|
if (column_indexes_for_input_fields[file_column].has_value())
|
||||||
<< "name: " << header.safeGetByPosition(i).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(i).name.size(), ' ')
|
|
||||||
<< "type: " << data_types[i]->getName() << ", " << std::string(max_length_of_data_type_name - data_types[i]->getName().size(), ' ');
|
|
||||||
|
|
||||||
BufferBase::Position prev_position = in.position();
|
|
||||||
BufferBase::Position curr_position = in.position();
|
|
||||||
std::exception_ptr exception;
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
{
|
||||||
skipWhitespacesAndTabs(in);
|
const auto & table_column = *column_indexes_for_input_fields[file_column];
|
||||||
prev_position = in.position();
|
const auto & current_column_type = data_types[table_column];
|
||||||
data_types[i]->deserializeAsTextCSV(*columns[i], in, format_settings);
|
const bool is_last_file_column =
|
||||||
curr_position = in.position();
|
file_column + 1 == column_indexes_for_input_fields.size();
|
||||||
skipWhitespacesAndTabs(in);
|
const bool at_delimiter = *in.position() == delimiter;
|
||||||
}
|
const bool at_last_column_line_end = is_last_file_column
|
||||||
catch (...)
|
&& (*in.position() == '\n' || *in.position() == '\r'
|
||||||
{
|
|| in.eof());
|
||||||
exception = std::current_exception();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (curr_position < prev_position)
|
auto & header = getPort().getHeader();
|
||||||
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
|
||||||
|
<< "name: " << header.safeGetByPosition(table_column).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(table_column).name.size(), ' ')
|
||||||
|
<< "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');
|
||||||
|
|
||||||
if (isNumber(data_types[i]) || isDateOrDateTime(data_types[i]))
|
if (format_settings.csv.empty_as_default
|
||||||
{
|
&& (at_delimiter || at_last_column_line_end))
|
||||||
/// An empty string instead of a value.
|
|
||||||
if (curr_position == prev_position)
|
|
||||||
{
|
{
|
||||||
out << "ERROR: text ";
|
columns[table_column]->insertDefault();
|
||||||
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
|
|
||||||
out << " is not like " << data_types[i]->getName() << "\n";
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
out << "parsed text: ";
|
|
||||||
verbosePrintString(prev_position, curr_position, out);
|
|
||||||
|
|
||||||
if (exception)
|
|
||||||
{
|
|
||||||
if (data_types[i]->getName() == "DateTime")
|
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
|
||||||
else if (data_types[i]->getName() == "Date")
|
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
|
||||||
else
|
else
|
||||||
out << "ERROR\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
out << "\n";
|
|
||||||
|
|
||||||
if (data_types[i]->haveMaximumSizeOfValue())
|
|
||||||
{
|
|
||||||
if (*curr_position != '\n' && *curr_position != '\r' && *curr_position != delimiter)
|
|
||||||
{
|
{
|
||||||
out << "ERROR: garbage after " << data_types[i]->getName() << ": ";
|
BufferBase::Position prev_position = in.position();
|
||||||
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
|
BufferBase::Position curr_position = in.position();
|
||||||
|
std::exception_ptr exception;
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
skipWhitespacesAndTabs(in);
|
||||||
|
prev_position = in.position();
|
||||||
|
readField(*columns[table_column], current_column_type, is_last_file_column, table_column);
|
||||||
|
curr_position = in.position();
|
||||||
|
skipWhitespacesAndTabs(in);
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
exception = std::current_exception();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (curr_position < prev_position)
|
||||||
|
throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);
|
||||||
|
|
||||||
|
if (isNativeNumber(current_column_type) || isDateOrDateTime(current_column_type))
|
||||||
|
{
|
||||||
|
/// An empty string instead of a value.
|
||||||
|
if (curr_position == prev_position)
|
||||||
|
{
|
||||||
|
out << "ERROR: text ";
|
||||||
|
verbosePrintString(prev_position, std::min(prev_position + 10, in.buffer().end()), out);
|
||||||
|
out << " is not like " << current_column_type->getName() << "\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out << "parsed text: ";
|
||||||
|
verbosePrintString(prev_position, curr_position, out);
|
||||||
|
|
||||||
|
if (exception)
|
||||||
|
{
|
||||||
|
if (current_column_type->getName() == "DateTime")
|
||||||
|
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
||||||
|
else if (current_column_type->getName() == "Date")
|
||||||
|
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
||||||
|
else
|
||||||
|
out << "ERROR\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
out << "\n";
|
out << "\n";
|
||||||
|
|
||||||
if (data_types[i]->getName() == "DateTime")
|
if (current_column_type->haveMaximumSizeOfValue()
|
||||||
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
&& *curr_position != '\n' && *curr_position != '\r'
|
||||||
else if (data_types[i]->getName() == "Date")
|
&& *curr_position != delimiter)
|
||||||
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
{
|
||||||
|
out << "ERROR: garbage after " << current_column_type->getName() << ": ";
|
||||||
|
verbosePrintString(curr_position, std::min(curr_position + 10, in.buffer().end()), out);
|
||||||
|
out << "\n";
|
||||||
|
|
||||||
return false;
|
if (current_column_type->getName() == "DateTime")
|
||||||
|
out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
|
||||||
|
else if (current_column_type->getName() == "Date")
|
||||||
|
out << "ERROR: Date must be in YYYY-MM-DD format.\n";
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
static const String skipped_column_str = "<SKIPPED COLUMN>";
|
||||||
|
out << "Column " << file_column << ", " << std::string((file_column < 10 ? 2 : file_column < 100 ? 1 : 0), ' ')
|
||||||
|
<< "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
|
||||||
|
<< "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');
|
||||||
|
|
||||||
|
String tmp;
|
||||||
|
readCSVString(tmp, in, format_settings.csv);
|
||||||
|
}
|
||||||
|
|
||||||
/// Delimiters
|
/// Delimiters
|
||||||
if (i + 1 == size)
|
if (file_column + 1 == column_indexes_for_input_fields.size())
|
||||||
{
|
{
|
||||||
if (in.eof())
|
if (in.eof())
|
||||||
return false;
|
return false;
|
||||||
@ -294,8 +461,8 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|||||||
out << "ERROR: There is no line feed. ";
|
out << "ERROR: There is no line feed. ";
|
||||||
verbosePrintString(in.position(), in.position() + 1, out);
|
verbosePrintString(in.position(), in.position() + 1, out);
|
||||||
out << " found instead.\n"
|
out << " found instead.\n"
|
||||||
" It's like your file has more columns than expected.\n"
|
" It's like your file has more columns than expected.\n"
|
||||||
"And if your file have right number of columns, maybe it have unquoted string value with comma.\n";
|
"And if your file have right number of columns, maybe it have unquoted string value with comma.\n";
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -313,8 +480,8 @@ bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
|||||||
if (*in.position() == '\n' || *in.position() == '\r')
|
if (*in.position() == '\n' || *in.position() == '\r')
|
||||||
{
|
{
|
||||||
out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
|
out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
|
||||||
" It's like your file has less columns than expected.\n"
|
" It's like your file has less columns than expected.\n"
|
||||||
"And if your file have right number of columns, maybe it have unescaped quotes in values.\n";
|
"And if your file have right number of columns, maybe it have unescaped quotes in values.\n";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -347,6 +514,45 @@ void CSVRowInputFormat::updateDiagnosticInfo()
|
|||||||
pos_of_current_row = in.position();
|
pos_of_current_row = in.position();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx)
|
||||||
|
{
|
||||||
|
const bool at_delimiter = *in.position() == format_settings.csv.delimiter;
|
||||||
|
const bool at_last_column_line_end = is_last_file_column
|
||||||
|
&& (*in.position() == '\n' || *in.position() == '\r'
|
||||||
|
|| in.eof());
|
||||||
|
|
||||||
|
if (format_settings.csv.empty_as_default
|
||||||
|
&& (at_delimiter || at_last_column_line_end))
|
||||||
|
{
|
||||||
|
/// Treat empty unquoted column value as default value, if
|
||||||
|
/// specified in the settings. Tuple columns might seem
|
||||||
|
/// problematic, because they are never quoted but still contain
|
||||||
|
/// commas, which might be also used as delimiters. However,
|
||||||
|
/// they do not contain empty unquoted fields, so this check
|
||||||
|
/// works for tuples as well.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else if (column_idx_to_nullable_column_idx[column_idx])
|
||||||
|
{
|
||||||
|
/// If value is null but type is not nullable then use default value instead.
|
||||||
|
const size_t nullable_idx = *column_idx_to_nullable_column_idx[column_idx];
|
||||||
|
auto & tmp_col = *nullable_columns[nullable_idx];
|
||||||
|
nullable_types[nullable_idx]->deserializeAsTextCSV(tmp_col, in, format_settings);
|
||||||
|
Field value = tmp_col[0];
|
||||||
|
tmp_col.popBack(1); /// do not store copy of values in memory
|
||||||
|
if (value.isNull())
|
||||||
|
return false;
|
||||||
|
column.insert(value);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/// Read the column normally.
|
||||||
|
type->deserializeAsTextCSV(column, in, format_settings);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void registerInputFormatProcessorCSV(FormatFactory & factory)
|
void registerInputFormatProcessorCSV(FormatFactory & factory)
|
||||||
{
|
{
|
||||||
@ -359,7 +565,7 @@ void registerInputFormatProcessorCSV(FormatFactory & factory)
|
|||||||
IRowInputFormat::Params params,
|
IRowInputFormat::Params params,
|
||||||
const FormatSettings & settings)
|
const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
return std::make_shared<CSVRowInputFormat>(buf, sample, params, with_names, settings);
|
return std::make_shared<CSVRowInputFormat>(buf, sample, std::move(params), with_names, settings);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -36,8 +36,26 @@ private:
|
|||||||
|
|
||||||
const FormatSettings format_settings;
|
const FormatSettings format_settings;
|
||||||
|
|
||||||
/// For convenient diagnostics in case of an error.
|
using IndexesMap = std::unordered_map<String, size_t>;
|
||||||
|
IndexesMap column_indexes_by_names;
|
||||||
|
|
||||||
|
/// Maps indexes of columns in the input file to indexes of table columns
|
||||||
|
using OptionalIndexes = std::vector<std::optional<size_t>>;
|
||||||
|
OptionalIndexes column_indexes_for_input_fields;
|
||||||
|
|
||||||
|
/// Tracks which colums we have read in a single read() call.
|
||||||
|
/// For columns that are never read, it is initialized to false when we
|
||||||
|
/// read the file header, and never changed afterwards.
|
||||||
|
/// For other columns, it is updated on each read() call.
|
||||||
|
std::vector<UInt8> read_columns;
|
||||||
|
|
||||||
|
/// Whether we have any columns that are not read from file at all,
|
||||||
|
/// and must be always initialized with defaults.
|
||||||
|
bool have_always_default_columns = false;
|
||||||
|
|
||||||
|
void addInputColumn(const String & column_name);
|
||||||
|
|
||||||
|
/// For convenient diagnostics in case of an error.
|
||||||
size_t row_num = 0;
|
size_t row_num = 0;
|
||||||
|
|
||||||
/// How many bytes were read, not counting those that are still in the buffer.
|
/// How many bytes were read, not counting those that are still in the buffer.
|
||||||
@ -47,10 +65,17 @@ private:
|
|||||||
char * pos_of_current_row = nullptr;
|
char * pos_of_current_row = nullptr;
|
||||||
char * pos_of_prev_row = nullptr;
|
char * pos_of_prev_row = nullptr;
|
||||||
|
|
||||||
|
/// For setting input_format_null_as_default
|
||||||
|
DataTypes nullable_types;
|
||||||
|
MutableColumns nullable_columns;
|
||||||
|
OptionalIndexes column_idx_to_nullable_column_idx;
|
||||||
|
|
||||||
void updateDiagnosticInfo();
|
void updateDiagnosticInfo();
|
||||||
|
|
||||||
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
|
||||||
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
|
WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name);
|
||||||
|
|
||||||
|
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column, size_t column_idx);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
#include "config_formats.h"
|
#include "config_formats.h"
|
||||||
|
#include <Processors/Formats/Impl/CapnProtoRowInputFormat.h> // Y_IGNORE
|
||||||
#if USE_CAPNP
|
#if USE_CAPNP
|
||||||
|
|
||||||
#include <IO/ReadBuffer.h>
|
#include <IO/ReadBuffer.h>
|
||||||
#include <Interpreters/Context.h>
|
#include <Interpreters/Context.h>
|
||||||
#include <Processors/Formats/Impl/CapnProtoRowInputFormat.h> // Y_IGNORE
|
|
||||||
#include <Formats/FormatFactory.h>
|
#include <Formats/FormatFactory.h>
|
||||||
#include <Formats/FormatSchemaInfo.h>
|
#include <Formats/FormatSchemaInfo.h>
|
||||||
#include <capnp/serialize.h> // Y_IGNORE
|
#include <capnp/serialize.h> // Y_IGNORE
|
||||||
@ -179,7 +179,7 @@ void CapnProtoRowInputFormat::createActions(const NestedFieldList & sorted_field
|
|||||||
}
|
}
|
||||||
|
|
||||||
CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params, const FormatSchemaInfo & info)
|
CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params, const FormatSchemaInfo & info)
|
||||||
: IRowInputFormat(std::move(header), in_, params), parser(std::make_shared<SchemaParser>())
|
: IRowInputFormat(std::move(header), in_, std::move(params)), parser(std::make_shared<SchemaParser>())
|
||||||
{
|
{
|
||||||
// Parse the schema and fetch the root object
|
// Parse the schema and fetch the root object
|
||||||
|
|
||||||
@ -206,28 +206,42 @@ CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header,
|
|||||||
createActions(list, root);
|
createActions(list, root);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kj::Array<capnp::word> CapnProtoRowInputFormat::readMessage()
|
||||||
|
{
|
||||||
|
uint32_t segment_count;
|
||||||
|
in.readStrict(reinterpret_cast<char*>(&segment_count), sizeof(uint32_t));
|
||||||
|
|
||||||
|
// one for segmentCount and one because segmentCount starts from 0
|
||||||
|
const auto prefix_size = (2 + segment_count) * sizeof(uint32_t);
|
||||||
|
const auto words_prefix_size = (segment_count + 1) / 2 + 1;
|
||||||
|
auto prefix = kj::heapArray<capnp::word>(words_prefix_size);
|
||||||
|
auto prefix_chars = prefix.asChars();
|
||||||
|
::memcpy(prefix_chars.begin(), &segment_count, sizeof(uint32_t));
|
||||||
|
|
||||||
|
// read size of each segment
|
||||||
|
for (size_t i = 0; i <= segment_count; ++i)
|
||||||
|
in.readStrict(prefix_chars.begin() + ((i + 1) * sizeof(uint32_t)), sizeof(uint32_t));
|
||||||
|
|
||||||
|
// calculate size of message
|
||||||
|
const auto expected_words = capnp::expectedSizeInWordsFromPrefix(prefix);
|
||||||
|
const auto expected_bytes = expected_words * sizeof(capnp::word);
|
||||||
|
const auto data_size = expected_bytes - prefix_size;
|
||||||
|
auto msg = kj::heapArray<capnp::word>(expected_words);
|
||||||
|
auto msg_chars = msg.asChars();
|
||||||
|
|
||||||
|
// read full message
|
||||||
|
::memcpy(msg_chars.begin(), prefix_chars.begin(), prefix_size);
|
||||||
|
in.readStrict(msg_chars.begin() + prefix_size, data_size);
|
||||||
|
|
||||||
|
return msg;
|
||||||
|
}
|
||||||
|
|
||||||
bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
|
bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
|
||||||
{
|
{
|
||||||
if (in.eof())
|
if (in.eof())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Read from underlying buffer directly
|
auto array = readMessage();
|
||||||
auto buf = in.buffer();
|
|
||||||
auto base = reinterpret_cast<const capnp::word *>(in.position());
|
|
||||||
|
|
||||||
// Check if there's enough bytes in the buffer to read the full message
|
|
||||||
kj::Array<capnp::word> heap_array;
|
|
||||||
auto array = kj::arrayPtr(base, buf.size() - in.offset());
|
|
||||||
auto expected_words = capnp::expectedSizeInWordsFromPrefix(array);
|
|
||||||
if (expected_words * sizeof(capnp::word) > array.size())
|
|
||||||
{
|
|
||||||
// We'll need to reassemble the message in a contiguous buffer
|
|
||||||
heap_array = kj::heapArray<capnp::word>(expected_words);
|
|
||||||
in.readStrict(heap_array.asChars().begin(), heap_array.asChars().size());
|
|
||||||
array = heap_array.asPtr();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#if CAPNP_VERSION >= 8000
|
#if CAPNP_VERSION >= 8000
|
||||||
capnp::UnalignedFlatArrayMessageReader msg(array);
|
capnp::UnalignedFlatArrayMessageReader msg(array);
|
||||||
@ -281,13 +295,6 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Advance buffer position if used directly
|
|
||||||
if (heap_array.size() == 0)
|
|
||||||
{
|
|
||||||
auto parsed = (msg.getEnd() - base) * sizeof(capnp::word);
|
|
||||||
in.position() += parsed;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -297,7 +304,7 @@ void registerInputFormatProcessorCapnProto(FormatFactory & factory)
|
|||||||
"CapnProto",
|
"CapnProto",
|
||||||
[](ReadBuffer & buf, const Block & sample, const Context & context, IRowInputFormat::Params params, const FormatSettings &)
|
[](ReadBuffer & buf, const Block & sample, const Context & context, IRowInputFormat::Params params, const FormatSettings &)
|
||||||
{
|
{
|
||||||
return std::make_shared<CapnProtoRowInputFormat>(buf, sample, params, FormatSchemaInfo(context, "capnp"));
|
return std::make_shared<CapnProtoRowInputFormat>(buf, sample, std::move(params), FormatSchemaInfo(context, "CapnProto"));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,6 +40,8 @@ public:
|
|||||||
bool readRow(MutableColumns & columns, RowReadExtension &) override;
|
bool readRow(MutableColumns & columns, RowReadExtension &) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
kj::Array<capnp::word> readMessage();
|
||||||
|
|
||||||
// Build a traversal plan from a sorted list of fields
|
// Build a traversal plan from a sorted list of fields
|
||||||
void createActions(const NestedFieldList & sortedFields, capnp::StructSchema reader);
|
void createActions(const NestedFieldList & sortedFields, capnp::StructSchema reader);
|
||||||
|
|
||||||
|
@ -34,8 +34,6 @@ void JSONCompactRowOutputFormat::writeTotalsFieldDelimiter()
|
|||||||
|
|
||||||
void JSONCompactRowOutputFormat::writeRowStartDelimiter()
|
void JSONCompactRowOutputFormat::writeRowStartDelimiter()
|
||||||
{
|
{
|
||||||
if (row_count > 0)
|
|
||||||
writeCString(",\n", *ostr);
|
|
||||||
writeCString("\t\t[", *ostr);
|
writeCString("\t\t[", *ostr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ enum
|
|||||||
|
|
||||||
JSONEachRowRowInputFormat::JSONEachRowRowInputFormat(
|
JSONEachRowRowInputFormat::JSONEachRowRowInputFormat(
|
||||||
ReadBuffer & in_, const Block & header, Params params, const FormatSettings & format_settings)
|
ReadBuffer & in_, const Block & header, Params params, const FormatSettings & format_settings)
|
||||||
: IRowInputFormat(header, in_, params), format_settings(format_settings), name_map(header.columns())
|
: IRowInputFormat(header, in_, std::move(params)), format_settings(format_settings), name_map(header.columns())
|
||||||
{
|
{
|
||||||
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
||||||
skipBOMIfExists(in);
|
skipBOMIfExists(in);
|
||||||
@ -263,7 +263,7 @@ void registerInputFormatProcessorJSONEachRow(FormatFactory & factory)
|
|||||||
IRowInputFormat::Params params,
|
IRowInputFormat::Params params,
|
||||||
const FormatSettings & settings)
|
const FormatSettings & settings)
|
||||||
{
|
{
|
||||||
return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, params, settings);
|
return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ class ReadBuffer;
|
|||||||
|
|
||||||
|
|
||||||
/** A stream for reading data in JSON format, where each row is represented by a separate JSON object.
|
/** A stream for reading data in JSON format, where each row is represented by a separate JSON object.
|
||||||
* Objects can be separated by feed return, other whitespace characters in any number and possibly a comma.
|
* Objects can be separated by line feed, other whitespace characters in any number and possibly a comma.
|
||||||
* Fields can be listed in any order (including, in different lines there may be different order),
|
* Fields can be listed in any order (including, in different lines there may be different order),
|
||||||
* and some fields may be missing.
|
* and some fields may be missing.
|
||||||
*/
|
*/
|
||||||
|
@ -24,6 +24,11 @@ public:
|
|||||||
void writeRowStartDelimiter() override;
|
void writeRowStartDelimiter() override;
|
||||||
void writeRowEndDelimiter() override;
|
void writeRowEndDelimiter() override;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
/// No totals and extremes.
|
||||||
|
void consumeTotals(Chunk) override {}
|
||||||
|
void consumeExtremes(Chunk) override {}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
size_t field_number = 0;
|
size_t field_number = 0;
|
||||||
Names fields;
|
Names fields;
|
||||||
|
@ -17,39 +17,47 @@ using namespace MySQLProtocol;
|
|||||||
MySQLOutputFormat::MySQLOutputFormat(WriteBuffer & out_, const Block & header, const Context & context, const FormatSettings & settings)
|
MySQLOutputFormat::MySQLOutputFormat(WriteBuffer & out_, const Block & header, const Context & context, const FormatSettings & settings)
|
||||||
: IOutputFormat(header, out_)
|
: IOutputFormat(header, out_)
|
||||||
, context(context)
|
, context(context)
|
||||||
, packet_sender(std::make_shared<PacketSender>(out, const_cast<uint8_t &>(context.mysql.sequence_id))) /// TODO: fix it
|
, packet_sender(out, const_cast<uint8_t &>(context.mysql.sequence_id)) /// TODO: fix it
|
||||||
, format_settings(settings)
|
, format_settings(settings)
|
||||||
{
|
{
|
||||||
|
packet_sender.max_packet_size = context.mysql.max_packet_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void MySQLOutputFormat::initialize()
|
||||||
|
{
|
||||||
|
if (initialized)
|
||||||
|
return;
|
||||||
|
|
||||||
|
initialized = true;
|
||||||
|
auto & header = getPort(PortKind::Main).getHeader();
|
||||||
|
|
||||||
|
|
||||||
|
if (header.columns())
|
||||||
|
{
|
||||||
|
|
||||||
|
packet_sender.sendPacket(LengthEncodedNumber(header.columns()));
|
||||||
|
|
||||||
|
for (const ColumnWithTypeAndName & column : header.getColumnsWithTypeAndName())
|
||||||
|
{
|
||||||
|
ColumnDefinition column_definition(column.name, CharacterSet::binary, 0, ColumnType::MYSQL_TYPE_STRING,
|
||||||
|
0, 0);
|
||||||
|
packet_sender.sendPacket(column_definition);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(context.mysql.client_capabilities & Capability::CLIENT_DEPRECATE_EOF))
|
||||||
|
{
|
||||||
|
packet_sender.sendPacket(EOF_Packet(0, 0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void MySQLOutputFormat::consume(Chunk chunk)
|
void MySQLOutputFormat::consume(Chunk chunk)
|
||||||
{
|
{
|
||||||
|
initialize();
|
||||||
|
|
||||||
auto & header = getPort(PortKind::Main).getHeader();
|
auto & header = getPort(PortKind::Main).getHeader();
|
||||||
|
|
||||||
if (!initialized)
|
|
||||||
{
|
|
||||||
initialized = true;
|
|
||||||
|
|
||||||
|
|
||||||
if (header.columns())
|
|
||||||
{
|
|
||||||
|
|
||||||
packet_sender->sendPacket(LengthEncodedNumber(header.columns()));
|
|
||||||
|
|
||||||
for (const ColumnWithTypeAndName & column : header.getColumnsWithTypeAndName())
|
|
||||||
{
|
|
||||||
ColumnDefinition column_definition(column.name, CharacterSet::binary, 0, ColumnType::MYSQL_TYPE_STRING,
|
|
||||||
0, 0);
|
|
||||||
packet_sender->sendPacket(column_definition);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!(context.mysql.client_capabilities & Capability::CLIENT_DEPRECATE_EOF))
|
|
||||||
{
|
|
||||||
packet_sender->sendPacket(EOF_Packet(0, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t rows = chunk.getNumRows();
|
size_t rows = chunk.getNumRows();
|
||||||
auto & columns = chunk.getColumns();
|
auto & columns = chunk.getColumns();
|
||||||
|
|
||||||
@ -58,14 +66,11 @@ void MySQLOutputFormat::consume(Chunk chunk)
|
|||||||
ResultsetRow row_packet;
|
ResultsetRow row_packet;
|
||||||
for (size_t col = 0; col < columns.size(); ++col)
|
for (size_t col = 0; col < columns.size(); ++col)
|
||||||
{
|
{
|
||||||
String column_value;
|
WriteBufferFromOwnString ostr;
|
||||||
WriteBufferFromString ostr(column_value);
|
|
||||||
header.getByPosition(col).type->serializeAsText(*columns[col], i, ostr, format_settings);
|
header.getByPosition(col).type->serializeAsText(*columns[col], i, ostr, format_settings);
|
||||||
ostr.finish();
|
row_packet.appendColumn(std::move(ostr.str()));
|
||||||
|
|
||||||
row_packet.appendColumn(std::move(column_value));
|
|
||||||
}
|
}
|
||||||
packet_sender->sendPacket(row_packet);
|
packet_sender.sendPacket(row_packet);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,15 +89,19 @@ void MySQLOutputFormat::finalize()
|
|||||||
<< formatReadableSizeWithBinarySuffix(info.read_bytes / info.elapsed_seconds) << "/sec.";
|
<< formatReadableSizeWithBinarySuffix(info.read_bytes / info.elapsed_seconds) << "/sec.";
|
||||||
}
|
}
|
||||||
|
|
||||||
auto & header = getPort(PortKind::Main).getHeader();
|
const auto & header = getPort(PortKind::Main).getHeader();
|
||||||
|
|
||||||
if (header.columns() == 0)
|
if (header.columns() == 0)
|
||||||
packet_sender->sendPacket(OK_Packet(0x0, context.mysql.client_capabilities, affected_rows, 0, 0, "", human_readable_info.str()), true);
|
packet_sender.sendPacket(OK_Packet(0x0, context.mysql.client_capabilities, affected_rows, 0, 0, "", human_readable_info.str()), true);
|
||||||
else
|
else
|
||||||
if (context.mysql.client_capabilities & CLIENT_DEPRECATE_EOF)
|
if (context.mysql.client_capabilities & CLIENT_DEPRECATE_EOF)
|
||||||
packet_sender->sendPacket(OK_Packet(0xfe, context.mysql.client_capabilities, affected_rows, 0, 0, "", human_readable_info.str()), true);
|
packet_sender.sendPacket(OK_Packet(0xfe, context.mysql.client_capabilities, affected_rows, 0, 0, "", human_readable_info.str()), true);
|
||||||
else
|
else
|
||||||
packet_sender->sendPacket(EOF_Packet(0, 0), true);
|
packet_sender.sendPacket(EOF_Packet(0, 0), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MySQLOutputFormat::flush()
|
||||||
|
{
|
||||||
|
packet_sender.out->next();
|
||||||
}
|
}
|
||||||
|
|
||||||
void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory)
|
void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory)
|
||||||
|
@ -25,13 +25,17 @@ public:
|
|||||||
|
|
||||||
void consume(Chunk) override;
|
void consume(Chunk) override;
|
||||||
void finalize() override;
|
void finalize() override;
|
||||||
|
void flush() override;
|
||||||
|
void doWritePrefix() override { initialize(); }
|
||||||
|
|
||||||
|
void initialize();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
bool initialized = false;
|
bool initialized = false;
|
||||||
|
|
||||||
const Context & context;
|
const Context & context;
|
||||||
std::shared_ptr<MySQLProtocol::PacketSender> packet_sender;
|
MySQLProtocol::PacketSender packet_sender;
|
||||||
FormatSettings format_settings;
|
FormatSettings format_settings;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#include <Core/iostream_debug_helpers.h>
|
#include <Core/iostream_debug_helpers.h>
|
||||||
|
#include <DataTypes/DataTypeLowCardinality.h>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
@ -38,7 +39,7 @@ void ODBCDriver2BlockOutputFormat::writeRow(const Block & header, const Columns
|
|||||||
{
|
{
|
||||||
{
|
{
|
||||||
WriteBufferFromString text_out(buffer);
|
WriteBufferFromString text_out(buffer);
|
||||||
header.getByPosition(row_idx).type->serializeAsText(*column, row_idx, text_out, format_settings);
|
header.getByPosition(column_idx).type->serializeAsText(*column, row_idx, text_out, format_settings);
|
||||||
}
|
}
|
||||||
writeODBCString(out, buffer);
|
writeODBCString(out, buffer);
|
||||||
}
|
}
|
||||||
@ -95,8 +96,10 @@ void ODBCDriver2BlockOutputFormat::writePrefix()
|
|||||||
writeODBCString(out, "type");
|
writeODBCString(out, "type");
|
||||||
for (size_t i = 0; i < columns; ++i)
|
for (size_t i = 0; i < columns; ++i)
|
||||||
{
|
{
|
||||||
const ColumnWithTypeAndName & col = header.getByPosition(i);
|
auto type = header.getByPosition(i).type;
|
||||||
writeODBCString(out, col.type->getName());
|
if (type->lowCardinality())
|
||||||
|
type = recursiveRemoveLowCardinality(type);
|
||||||
|
writeODBCString(out, type->getName());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,8 +1,7 @@
|
|||||||
#include "config_formats.h"
|
#include "config_formats.h"
|
||||||
|
#include <Processors/Formats/Impl/ParquetBlockInputFormat.h>
|
||||||
|
|
||||||
#if USE_PARQUET
|
#if USE_PARQUET
|
||||||
#include "ParquetBlockInputFormat.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -29,15 +28,9 @@
|
|||||||
#include <common/DateLUTImpl.h>
|
#include <common/DateLUTImpl.h>
|
||||||
#include <ext/range.h>
|
#include <ext/range.h>
|
||||||
#include <arrow/api.h>
|
#include <arrow/api.h>
|
||||||
//#include <arrow/buffer.h>
|
|
||||||
//#include <arrow/io/api.h>
|
|
||||||
#include <parquet/arrow/reader.h>
|
#include <parquet/arrow/reader.h>
|
||||||
//#include <parquet/arrow/writer.h>
|
|
||||||
//#include <parquet/exception.h>
|
|
||||||
#include <parquet/file_reader.h>
|
#include <parquet/file_reader.h>
|
||||||
|
|
||||||
#include <Core/iostream_debug_helpers.h> // REMOVE ME
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
namespace ErrorCodes
|
namespace ErrorCodes
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <Common/config.h>
|
#include "config_formats.h"
|
||||||
#if USE_PARQUET
|
#if USE_PARQUET
|
||||||
# include <Processors/Formats/IInputFormat.h>
|
|
||||||
//# include <parquet/file_reader.h>
|
#include <Processors/Formats/IInputFormat.h>
|
||||||
//# include <parquet/arrow/reader.h>
|
|
||||||
//# include <arrow/buffer.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace parquet { namespace arrow { class FileReader; } }
|
namespace parquet { namespace arrow { class FileReader; } }
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#include "config_formats.h"
|
#include "config_formats.h"
|
||||||
|
#include <Processors/Formats/Impl/ParquetBlockOutputFormat.h>
|
||||||
|
|
||||||
#if USE_PARQUET
|
#if USE_PARQUET
|
||||||
# include "ParquetBlockOutputFormat.h"
|
|
||||||
|
|
||||||
// TODO: clean includes
|
// TODO: clean includes
|
||||||
# include <Columns/ColumnDecimal.h>
|
# include <Columns/ColumnDecimal.h>
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user