From 6a5daca135df39601585fef3c02f940ee162f6c1 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 8 Jun 2021 12:33:04 +0300 Subject: [PATCH] dynamic subcolumns: new format and several fixes --- src/Columns/ColumnObject.cpp | 32 +++++-- src/Columns/IColumnImpl.h | 2 +- src/DataTypes/FieldToDataType.cpp | 8 +- .../Serializations/SerializationObject.cpp | 1 + src/Formats/registerFormats.cpp | 4 + src/Interpreters/TreeRewriter.cpp | 2 +- .../Impl/JSONAsObjectRowInputFormat.cpp | 84 +++++++++++++++++++ tests/queries/0_stateless/01825_type_json.sh | 40 --------- 8 files changed, 122 insertions(+), 51 deletions(-) create mode 100644 src/Processors/Formats/Impl/JSONAsObjectRowInputFormat.cpp delete mode 100644 tests/queries/0_stateless/01825_type_json.sh diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index e6e7621c908..5ddfc75eaf1 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -7,7 +7,9 @@ #include #include -#include +#include +#include + namespace DB { @@ -169,6 +171,17 @@ Names ColumnObject::getKeys() const return keys; } +static bool isPrefix(const Strings & prefix, const Strings & strings) +{ + if (prefix.size() > strings.size()) + return false; + + for (size_t i = 0; i < prefix.size(); ++i) + if (prefix[i] != strings[i]) + return false; + return true; +} + void ColumnObject::optimizeTypesOfSubcolumns() { if (optimized_types_of_subcolumns) @@ -184,11 +197,20 @@ void ColumnObject::optimizeTypesOfSubcolumns() if (isNothing(getBaseTypeOfArray(to_type))) continue; - auto it = std::find_if(subcolumns.begin(), subcolumns.end(), - [&name = name](const auto & elem) { return elem.first.size() > name.size() && startsWith(elem.first, name); }); + Strings name_parts; + boost::split(name_parts, name, boost::is_any_of(".")); - if (it != subcolumns.end()) - throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Data in Object has ambiguous paths: '{}' and '{}", name, it->first); + for (const auto & [other_name, _] : subcolumns) + { + if (other_name.size() > name.size()) + { + Strings other_name_parts; + boost::split(other_name_parts, other_name, boost::is_any_of(".")); + + if (isPrefix(name_parts, other_name_parts)) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Data in Object has ambiguous paths: '{}' and '{}", name, other_name); + } + } if (to_type->equals(*from_type)) { diff --git a/src/Columns/IColumnImpl.h b/src/Columns/IColumnImpl.h index fe9ad251111..394d6ccbc5d 100644 --- a/src/Columns/IColumnImpl.h +++ b/src/Columns/IColumnImpl.h @@ -151,7 +151,7 @@ double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const return 0.0; size_t step = num_rows / num_sampled_rows; - std::uniform_int_distribution dist(1, step); + std::uniform_int_distribution dist(0, step - 1); size_t res = 0; for (size_t i = 0; i < num_rows; i += step) diff --git a/src/DataTypes/FieldToDataType.cpp b/src/DataTypes/FieldToDataType.cpp index 77b46c8d042..2a032db52a0 100644 --- a/src/DataTypes/FieldToDataType.cpp +++ b/src/DataTypes/FieldToDataType.cpp @@ -109,7 +109,7 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const element_types.reserve(x.size()); for (const Field & elem : x) - element_types.emplace_back(applyVisitor(FieldToDataType(), elem)); + element_types.emplace_back(applyVisitor(FieldToDataType(allow_convertion_to_string), elem)); return std::make_shared(getLeastSupertype(element_types, allow_convertion_to_string)); } @@ -124,7 +124,7 @@ DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const element_types.reserve(ext::size(tuple)); for (const auto & element : tuple) - element_types.push_back(applyVisitor(FieldToDataType(), element)); + element_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), element)); return std::make_shared(element_types); } @@ -140,8 +140,8 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const { const auto & tuple = elem.safeGet(); assert(tuple.size() == 2); - key_types.push_back(applyVisitor(FieldToDataType(), tuple[0])); - value_types.push_back(applyVisitor(FieldToDataType(), tuple[1])); + key_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[0])); + value_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[1])); } return std::make_shared( diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index d38bfd5c769..7c0d8bf600d 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -28,6 +28,7 @@ namespace ErrorCodes namespace { + class FieldVisitorReplaceNull : public StaticVisitor { public: diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 89fb7c6cc02..260cefdbeaf 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -15,6 +15,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory); void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory); void registerFileSegmentationEngineRegexp(FormatFactory & factory); void registerFileSegmentationEngineJSONAsString(FormatFactory & factory); +void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory); /// Formats for both input/output. @@ -76,6 +77,7 @@ void registerOutputFormatProcessorPostgreSQLWire(FormatFactory & factory); void registerInputFormatProcessorRegexp(FormatFactory & factory); void registerInputFormatProcessorJSONAsString(FormatFactory & factory); +void registerInputFormatProcessorJSONAsObject(FormatFactory & factory); void registerInputFormatProcessorLineAsString(FormatFactory & factory); void registerInputFormatProcessorCapnProto(FormatFactory & factory); @@ -89,6 +91,7 @@ void registerFormats() registerFileSegmentationEngineJSONEachRow(factory); registerFileSegmentationEngineRegexp(factory); registerFileSegmentationEngineJSONAsString(factory); + registerFileSegmentationEngineJSONAsObject(factory); registerInputFormatNative(factory); registerOutputFormatNative(factory); @@ -147,6 +150,7 @@ void registerFormats() registerInputFormatProcessorRegexp(factory); registerInputFormatProcessorJSONAsString(factory); + registerInputFormatProcessorJSONAsObject(factory); registerInputFormatProcessorLineAsString(factory); #if !defined(ARCADIA_BUILD) diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 7305aa1c843..865685610ed 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -452,7 +452,7 @@ void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const AS bool found = false; for (const auto & column : source_columns) { - auto split = Nested::splitName(column.name); + auto split = Nested::splitName(column.name, /*reverse=*/ true); if (split.first == source_name && !split.second.empty()) { result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name; diff --git a/src/Processors/Formats/Impl/JSONAsObjectRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsObjectRowInputFormat.cpp new file mode 100644 index 00000000000..48550c50329 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONAsObjectRowInputFormat.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +class JSONAsObjectRowInputFormat : public IRowInputFormat +{ +public: + JSONAsObjectRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_); + + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; + String getName() const override { return "JSONAsObjectRowInputFormat"; } + +private: + const FormatSettings format_settings; +}; + +JSONAsObjectRowInputFormat::JSONAsObjectRowInputFormat( + ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : IRowInputFormat(header_, in_, std::move(params_)) + , format_settings(format_settings_) +{ + if (header_.columns() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Input format JSONAsObject is only suitable for tables with a single column of type Object but the number of columns is {}", + header_.columns()); + + if (!isObject(header_.getByPosition(0).type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Input format JSONAsObject is only suitable for tables with a single column of type Object but the column type is {}", + header_.getByPosition(0).type->getName()); +} + + +bool JSONAsObjectRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +{ + assert(serializations.size() == 1); + assert(columns.size() == 1); + + skipWhitespaceIfAny(in); + if (!in.eof()) + serializations[0]->deserializeTextJSON(*columns[0], in, format_settings); + + skipWhitespaceIfAny(in); + if (!in.eof() && *in.position() == ',') + ++in.position(); + skipWhitespaceIfAny(in); + + return !in.eof(); +} + +} + +void registerInputFormatProcessorJSONAsObject(FormatFactory & factory) +{ + factory.registerInputFormatProcessor("JSONAsObject", []( + ReadBuffer & buf, + const Block & sample, + IRowInputFormat::Params params, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, std::move(params), settings); + }); +} + +void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory) +{ + factory.registerFileSegmentationEngine("JSONAsObject", &fileSegmentationEngineJSONEachRowImpl); +} + +} diff --git a/tests/queries/0_stateless/01825_type_json.sh b/tests/queries/0_stateless/01825_type_json.sh deleted file mode 100644 index 59cce8b925c..00000000000 --- a/tests/queries/0_stateless/01825_type_json.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -set -e - -$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json" -$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json(id UInt64, data Object('JSON')) \ - ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0" - -cat << EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json FORMAT CSV" -1,{"k1":"aa","k2":{"k3":"bb","k4":"c"}} -2,{"k1":"ee","k5":"ff"} -EOF - -cat << EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json FORMAT CSV" -3,{"k5":"foo"} -EOF - -$CLICKHOUSE_CLIENT -q "SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM t_json ORDER BY id" -$CLICKHOUSE_CLIENT -q "SELECT name, column, type \ - FROM system.parts_columns WHERE table = 't_json' AND database = '$CLICKHOUSE_DATABASE' ORDER BY name" - -$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json" - -$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json(id UInt64, data Object('JSON')) \ - ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0" - -cat << EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json FORMAT CSV" -1,{"k1":[{"k2":"aaa","k3":[{"k4":"bbb"},{"k4":"ccc"}]},{"k2":"ddd","k3":[{"k4":"eee"},{"k4":"fff"}]}]} -EOF - -$CLICKHOUSE_CLIENT -q "SELECT id, data.k1.k2, data.k1.k3.k4 FROM t_json ORDER BY id" - -$CLICKHOUSE_CLIENT -q "SELECT name, column, type \ - FROM system.parts_columns WHERE table = 't_json' AND database = '$CLICKHOUSE_DATABASE' ORDER BY name" - -$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json"