From a7ddd8489a6c040cdcd9f280e6a500ab8e0c2dd1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 28 Sep 2020 03:11:19 +0300 Subject: [PATCH] Added format RawBLOB --- src/Formats/FormatFactory.cpp | 6 ++ .../Impl/JSONAsStringRowInputFormat.cpp | 17 ++++-- .../Formats/Impl/RawBLOBRowInputFormat.cpp | 55 +++++++++++++++++++ .../Formats/Impl/RawBLOBRowInputFormat.h | 24 ++++++++ .../Formats/Impl/RawBLOBRowOutputFormat.cpp | 38 +++++++++++++ .../Formats/Impl/RawBLOBRowOutputFormat.h | 41 ++++++++++++++ .../01509_format_raw_blob.reference | 2 + .../0_stateless/01509_format_raw_blob.sh | 18 ++++++ 8 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp create mode 100644 src/Processors/Formats/Impl/RawBLOBRowInputFormat.h create mode 100644 src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp create mode 100644 src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h create mode 100644 tests/queries/0_stateless/01509_format_raw_blob.reference create mode 100755 tests/queries/0_stateless/01509_format_raw_blob.sh diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 522149d3cfd..e9f37ff9752 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -367,6 +367,8 @@ void registerInputFormatProcessorArrow(FormatFactory & factory); void registerOutputFormatProcessorArrow(FormatFactory & factory); void registerInputFormatProcessorAvro(FormatFactory & factory); void registerOutputFormatProcessorAvro(FormatFactory & factory); +void registerInputFormatProcessorRawBLOB(FormatFactory & factory); +void registerOutputFormatProcessorRawBLOB(FormatFactory & factory); /// Output only (presentational) formats. @@ -426,6 +428,9 @@ FormatFactory::FormatFactory() registerOutputFormatProcessorTemplate(*this); registerInputFormatProcessorMsgPack(*this); registerOutputFormatProcessorMsgPack(*this); + registerInputFormatProcessorRawBLOB(*this); + registerOutputFormatProcessorRawBLOB(*this); + #if !defined(ARCADIA_BUILD) registerInputFormatProcessorORC(*this); registerOutputFormatProcessorORC(*this); @@ -456,6 +461,7 @@ FormatFactory::FormatFactory() registerInputFormatProcessorRegexp(*this); registerInputFormatProcessorJSONAsString(*this); registerInputFormatProcessorLineAsString(*this); + #if !defined(ARCADIA_BUILD) registerInputFormatProcessorCapnProto(*this); #endif diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 42fa764f011..bc57803152f 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -8,17 +10,22 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; extern const int INCORRECT_DATA; } JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) : IRowInputFormat(header_, in_, std::move(params_)), buf(in) { - if (header_.columns() > 1 || header_.getDataTypes()[0]->getTypeId() != TypeIndex::String) - { - throw Exception("This input format is only suitable for tables with a single column of type String.", ErrorCodes::LOGICAL_ERROR); - } + if (header_.columns() > 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "This input format is only suitable for tables with a single column of type String but the number of columns is {}", + header_.columns()); + + if (!isString(removeNullable(removeLowCardinality(header_.getByPosition(0).type)))) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "This input format is only suitable for tables with a single column of type String but the column type is {}", + header_.getByPosition(0).type->getName()); } void JSONAsStringRowInputFormat::resetParser() diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp new file mode 100644 index 00000000000..bd9eaada05e --- /dev/null +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +RawBLOBRowInputFormat::RawBLOBRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) + : IRowInputFormat(header_, in_, std::move(params_)) +{ + if (header_.columns() > 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "This input format is only suitable for tables with a single column of type String but the number of columns is {}", + header_.columns()); + + if (!isString(removeNullable(removeLowCardinality(header_.getByPosition(0).type)))) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "This input format is only suitable for tables with a single column of type String but the column type is {}", + header_.getByPosition(0).type->getName()); +} + +bool RawBLOBRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +{ + if (in.eof()) + return false; + + /// One excessive copy. + String blob; + readStringUntilEOF(blob, in); + columns.at(0)->insertData(blob.data(), blob.size()); + return false; +} + +void registerInputFormatProcessorRawBLOB(FormatFactory & factory) +{ + factory.registerInputFormatProcessor("RawBLOB", []( + ReadBuffer & buf, + const Block & sample, + const RowInputFormatParams & params, + const FormatSettings &) + { + return std::make_shared(sample, buf, params); + }); +} + +} + diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h new file mode 100644 index 00000000000..fd2c849687a --- /dev/null +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h @@ -0,0 +1,24 @@ +#pragma once + +#include + + +namespace DB +{ + +class ReadBuffer; + +/// This format slurps all input data into single value. +/// This format can only parse a table with single field of type String or similar. + +class RawBLOBRowInputFormat : public IRowInputFormat +{ +public: + RawBLOBRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); + + bool readRow(MutableColumns & columns, RowReadExtension &) override; + String getName() const override { return "RawBLOBRowInputFormat"; } +}; + +} + diff --git a/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp new file mode 100644 index 00000000000..786edce5edc --- /dev/null +++ b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.cpp @@ -0,0 +1,38 @@ +#include +#include +#include + +namespace DB +{ + + +RawBLOBRowOutputFormat::RawBLOBRowOutputFormat( + WriteBuffer & out_, + const Block & header_, + FormatFactory::WriteCallback callback) + : IRowOutputFormat(header_, out_, callback) +{ +} + + +void RawBLOBRowOutputFormat::writeField(const IColumn & column, const IDataType &, size_t row_num) +{ + StringRef value = column.getDataAt(row_num); + out.write(value.data, value.size); +} + + +void registerOutputFormatProcessorRawBLOB(FormatFactory & factory) +{ + factory.registerOutputFormatProcessor("RawBLOB", []( + WriteBuffer & buf, + const Block & sample, + FormatFactory::WriteCallback callback, + const FormatSettings &) + { + return std::make_shared(buf, sample, callback); + }); +} + +} + diff --git a/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h new file mode 100644 index 00000000000..8062092f5cd --- /dev/null +++ b/src/Processors/Formats/Impl/RawBLOBRowOutputFormat.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +class WriteBuffer; + + +/** This format only allows to output one column of type String or similar. + * It is output as raw bytes without any delimiters or escaping. + * + * The difference between RawBLOB and TSVRaw: + * - only single column dataset is supported; + * - data is output in binary; + * - no newline at the end of each value. + * + * The difference between RawBLOB and RowBinary: + * - only single column dataset is supported; + * - strings are output without their lengths. + * + * If you are output more than one value, the output format is ambiguous and you may not be able to read data back. + */ +class RawBLOBRowOutputFormat : public IRowOutputFormat +{ +public: + RawBLOBRowOutputFormat( + WriteBuffer & out_, + const Block & header_, + FormatFactory::WriteCallback callback); + + String getName() const override { return "RawBLOBRowOutputFormat"; } + + void writeField(const IColumn & column, const IDataType &, size_t row_num) override; +}; + +} + diff --git a/tests/queries/0_stateless/01509_format_raw_blob.reference b/tests/queries/0_stateless/01509_format_raw_blob.reference new file mode 100644 index 00000000000..dfa8f538e67 --- /dev/null +++ b/tests/queries/0_stateless/01509_format_raw_blob.reference @@ -0,0 +1,2 @@ +96b229180107fd2d23fd0a2ef9326701 - +96b229180107fd2d23fd0a2ef9326701 - diff --git a/tests/queries/0_stateless/01509_format_raw_blob.sh b/tests/queries/0_stateless/01509_format_raw_blob.sh new file mode 100755 index 00000000000..68d3844d727 --- /dev/null +++ b/tests/queries/0_stateless/01509_format_raw_blob.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -n --query " +DROP TABLE IF EXISTS t; +CREATE TABLE t (a LowCardinality(Nullable(String))) ENGINE = Memory; +" + +${CLICKHOUSE_CLIENT} --query "INSERT INTO t FORMAT RawBLOB" < ${BASH_SOURCE[0]} + +cat ${BASH_SOURCE[0]} | md5sum +${CLICKHOUSE_CLIENT} -n --query "SELECT * FROM t FORMAT RawBLOB" | md5sum + +${CLICKHOUSE_CLIENT} --query " +DROP TABLE t; +"