From f1377b0b4abde46ef4c1cc162e33ad539692bfac Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 3 Sep 2024 14:10:28 +0000 Subject: [PATCH 1/3] Fix uniq and GROUP BY for JSON/Dynamic types --- src/Columns/ColumnDynamic.cpp | 16 ++ src/Columns/ColumnObject.cpp | 244 +++++++++++------- src/Columns/ColumnObject.h | 5 + .../03231_dynamic_uniq_group_by.reference | 5 + .../03231_dynamic_uniq_group_by.sql | 15 ++ .../03232_json_uniq_group_by.reference | 12 + .../0_stateless/03232_json_uniq_group_by.sql | 39 +++ 7 files changed, 236 insertions(+), 100 deletions(-) create mode 100644 tests/queries/0_stateless/03231_dynamic_uniq_group_by.reference create mode 100644 tests/queries/0_stateless/03231_dynamic_uniq_group_by.sql create mode 100644 tests/queries/0_stateless/03232_json_uniq_group_by.reference create mode 100644 tests/queries/0_stateless/03232_json_uniq_group_by.sql diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index 9b55879a4f0..0b1dc3c363a 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -816,6 +816,22 @@ void ColumnDynamic::updateHashWithValue(size_t n, SipHash & hash) const return; } + /// If it's not null we update hash with the type name and the actual value. + + /// If value in this row is in shared variant, deserialize type and value and + /// update hash with it. 
+ if (discr == getSharedVariantDiscriminator()) + { + auto value = getSharedVariant().getDataAt(variant_col.offsetAt(n)); + ReadBufferFromMemory buf(value.data, value.size); + auto type = decodeDataType(buf); + hash.update(type->getName()); + auto tmp_column = type->createColumn(); + type->getDefaultSerialization()->deserializeBinary(*tmp_column, buf, getFormatSettings()); + tmp_column->updateHashWithValue(0, hash); + return; + } + hash.update(variant_info.variant_names[discr]); variant_col.getVariantByGlobalDiscriminator(discr).updateHashWithValue(variant_col.offsetAt(n), hash); } diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index e397b03b69e..920b0384448 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -47,15 +47,20 @@ ColumnObject::ColumnObject( , statistics(statistics_) { typed_paths.reserve(typed_paths_.size()); + sorted_typed_paths.reserve(typed_paths_.size()); for (auto & [path, column] : typed_paths_) - typed_paths[path] = std::move(column); + { + auto it = typed_paths.emplace(path, std::move(column)).first; + sorted_typed_paths.push_back(it->first); + } dynamic_paths.reserve(dynamic_paths_.size()); dynamic_paths_ptrs.reserve(dynamic_paths_.size()); for (auto & [path, column] : dynamic_paths_) { - dynamic_paths[path] = std::move(column); - dynamic_paths_ptrs[path] = assert_cast(dynamic_paths[path].get()); + auto it = dynamic_paths.emplace(path, std::move(column)).first; + dynamic_paths_ptrs[path] = assert_cast(it->second.get()); + sorted_dynamic_paths.insert(it->first); } } @@ -68,7 +73,8 @@ ColumnObject::ColumnObject( { if (!column->empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected non-empty typed path column in ColumnObject constructor"); - typed_paths[path] = std::move(column); + auto it = typed_paths.emplace(path, std::move(column)).first; + sorted_typed_paths.push_back(it->first); } MutableColumns paths_and_values; @@ -129,13 +135,8 @@ std::string ColumnObject::getName() const ss 
<< "Object("; ss << "max_dynamic_paths=" << global_max_dynamic_paths; ss << ", max_dynamic_types=" << max_dynamic_types; - std::vector sorted_typed_paths; - sorted_typed_paths.reserve(typed_paths.size()); - for (const auto & [path, column] : typed_paths) - sorted_typed_paths.push_back(path); - std::sort(sorted_typed_paths.begin(), sorted_typed_paths.end()); for (const auto & path : sorted_typed_paths) - ss << ", " << path << " " << typed_paths.at(path)->getName(); + ss << ", " << path << " " << typed_paths.find(path)->second->getName(); ss << ")"; return ss.str(); } @@ -260,6 +261,7 @@ ColumnDynamic * ColumnObject::tryToAddNewDynamicPath(std::string_view path) new_dynamic_column->insertManyDefaults(size()); auto it = dynamic_paths.emplace(path, std::move(new_dynamic_column)).first; auto it_ptr = dynamic_paths_ptrs.emplace(path, assert_cast(it->second.get())).first; + sorted_dynamic_paths.insert(it->first); return it_ptr->second; } @@ -288,8 +290,9 @@ void ColumnObject::setDynamicPaths(const std::vector & paths) auto new_dynamic_column = ColumnDynamic::create(max_dynamic_types); if (size) new_dynamic_column->insertManyDefaults(size); - dynamic_paths[path] = std::move(new_dynamic_column); - dynamic_paths_ptrs[path] = assert_cast(dynamic_paths[path].get()); + auto it = dynamic_paths.emplace(path, std::move(new_dynamic_column)).first; + dynamic_paths_ptrs[path] = assert_cast(it->second.get()); + sorted_dynamic_paths.insert(it->first); } } @@ -658,39 +661,61 @@ void ColumnObject::popBack(size_t n) StringRef ColumnObject::serializeValueIntoArena(size_t n, Arena & arena, const char *& begin) const { StringRef res(begin, 0); - // Serialize all paths and values in binary format. + /// First serialize values from typed paths in sorted order. They are the same for all instances of this column. 
+ for (auto path : sorted_typed_paths) + { + auto data_ref = typed_paths.find(path)->second->serializeValueIntoArena(n, arena, begin); + res.data = data_ref.data - res.size; + res.size += data_ref.size; + } + + /// Second, serialize paths and values in binary format from dynamic paths and shared data in sorted by path order. + /// Calculate total number of paths to serialize and write it. const auto & shared_data_offsets = getSharedDataOffsets(); size_t offset = shared_data_offsets[static_cast(n) - 1]; size_t end = shared_data_offsets[static_cast(n)]; - size_t num_paths = typed_paths.size() + dynamic_paths.size() + (end - offset); + size_t num_paths = (end - offset); + /// Don't serialize Nulls from dynamic paths. + for (const auto & [_, column] : dynamic_paths) + num_paths += !column->isNullAt(n); + char * pos = arena.allocContinue(sizeof(size_t), begin); memcpy(pos, &num_paths, sizeof(size_t)); res.data = pos - res.size; res.size += sizeof(size_t); - /// Serialize paths and values from typed paths. - for (const auto & [path, column] : typed_paths) - { - size_t path_size = path.size(); - pos = arena.allocContinue(sizeof(size_t) + path_size, begin); - memcpy(pos, &path_size, sizeof(size_t)); - memcpy(pos + sizeof(size_t), path.data(), path_size); - auto data_ref = column->serializeValueIntoArena(n, arena, begin); - res.data = data_ref.data - res.size - sizeof(size_t) - path_size; - res.size += data_ref.size + sizeof(size_t) + path_size; - } - /// Serialize paths and values from dynamic paths. - for (const auto & [path, column] : dynamic_paths) - { - WriteBufferFromOwnString buf; - getDynamicSerialization()->serializeBinary(*column, n, buf, getFormatSettings()); - serializePathAndValueIntoArena(arena, begin, path, buf.str(), res); - } - - /// Serialize paths and values from shared data. 
+ auto dynamic_paths_it = sorted_dynamic_paths.begin(); auto [shared_data_paths, shared_data_values] = getSharedDataPathsAndValues(); for (size_t i = offset; i != end; ++i) - serializePathAndValueIntoArena(arena, begin, shared_data_paths->getDataAt(i), shared_data_values->getDataAt(i), res); + { + auto path = shared_data_paths->getDataAt(i).toView(); + /// Paths in shared data are sorted. Serialize all paths from dynamic paths that go before this path in sorted order. + while (dynamic_paths_it != sorted_dynamic_paths.end() && *dynamic_paths_it < path) + { + const auto * dynamic_column = dynamic_paths_ptrs.find(*dynamic_paths_it)->second; + /// Don't serialize Nulls. + if (!dynamic_column->isNullAt(n)) + { + WriteBufferFromOwnString buf; + getDynamicSerialization()->serializeBinary(*dynamic_column, n, buf, getFormatSettings()); + serializePathAndValueIntoArena(arena, begin, StringRef(*dynamic_paths_it), buf.str(), res); + } + ++dynamic_paths_it; + } + serializePathAndValueIntoArena(arena, begin, StringRef(path), shared_data_values->getDataAt(i), res); + } + + /// Serialize all remaining paths in dynamic paths. + for (; dynamic_paths_it != sorted_dynamic_paths.end(); ++dynamic_paths_it) + { + const auto * dynamic_column = dynamic_paths_ptrs.find(*dynamic_paths_it)->second; + if (!dynamic_column->isNullAt(n)) + { + WriteBufferFromOwnString buf; + getDynamicSerialization()->serializeBinary(*dynamic_column, n, buf, getFormatSettings()); + serializePathAndValueIntoArena(arena, begin, StringRef(*dynamic_paths_it), buf.str(), res); + } + } return res; } @@ -711,70 +736,49 @@ void ColumnObject::serializePathAndValueIntoArena(DB::Arena & arena, const char const char * ColumnObject::deserializeAndInsertFromArena(const char * pos) { size_t current_size = size(); - /// Deserialize paths and values and insert them into typed paths, dynamic paths or shared data. - /// Serialized paths could be unsorted, so we will have to sort all paths that will be inserted into shared data. 
- std::vector> paths_and_values_for_shared_data; + /// First deserialize typed paths. They come first. + for (auto path : sorted_typed_paths) + pos = typed_paths.find(path)->second->deserializeAndInsertFromArena(pos); + + /// Second deserialize all other paths and values and insert them into dynamic paths or shared data. auto num_paths = unalignedLoad(pos); pos += sizeof(size_t); + const auto [shared_data_paths, shared_data_values] = getSharedDataPathsAndValues(); for (size_t i = 0; i != num_paths; ++i) { auto path_size = unalignedLoad(pos); pos += sizeof(size_t); std::string_view path(pos, path_size); pos += path_size; - /// Check if it's a typed path. In this case we should use - /// deserializeAndInsertFromArena of corresponding column. - if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end()) + /// Deserialize binary value and try to insert it to dynamic paths or shared data. + auto value_size = unalignedLoad(pos); + pos += sizeof(size_t); + std::string_view value(pos, value_size); + pos += value_size; + /// Check if we have this path in dynamic paths. + if (auto dynamic_it = dynamic_paths.find(path); dynamic_it != dynamic_paths.end()) { - pos = typed_it->second->deserializeAndInsertFromArena(pos); + ReadBufferFromMemory buf(value.data(), value.size()); + getDynamicSerialization()->deserializeBinary(*dynamic_it->second, buf, getFormatSettings()); } - /// If it's not a typed path, deserialize binary value and try to insert it - /// to dynamic paths or shared data. + /// Try to add a new dynamic path. + else if (auto * dynamic_path_column = tryToAddNewDynamicPath(path)) + { + ReadBufferFromMemory buf(value.data(), value.size()); + getDynamicSerialization()->deserializeBinary(*dynamic_path_column, buf, getFormatSettings()); + } + /// Limit on dynamic paths is reached, add this path to shared data. + /// Serialized paths are sorted, so we can insert right away. 
else { - auto value_size = unalignedLoad(pos); - pos += sizeof(size_t); - std::string_view value(pos, value_size); - pos += value_size; - /// Check if we have this path in dynamic paths. - if (auto dynamic_it = dynamic_paths.find(path); dynamic_it != dynamic_paths.end()) - { - ReadBufferFromMemory buf(value.data(), value.size()); - getDynamicSerialization()->deserializeBinary(*dynamic_it->second, buf, getFormatSettings()); - } - /// Try to add a new dynamic path. - else if (auto * dynamic_path_column = tryToAddNewDynamicPath(path)) - { - ReadBufferFromMemory buf(value.data(), value.size()); - getDynamicSerialization()->deserializeBinary(*dynamic_path_column, buf, getFormatSettings()); - } - /// Limit on dynamic paths is reached, add this path to shared data later. - else - { - paths_and_values_for_shared_data.emplace_back(path, value); - } + shared_data_paths->insertData(path.data(), path.size()); + shared_data_values->insertData(value.data(), value.size()); } } - /// Sort and insert all paths from paths_and_values_for_shared_data into shared data. - std::sort(paths_and_values_for_shared_data.begin(), paths_and_values_for_shared_data.end()); - const auto [shared_data_paths, shared_data_values] = getSharedDataPathsAndValues(); - for (const auto & [path, value] : paths_and_values_for_shared_data) - { - shared_data_paths->insertData(path.data(), path.size()); - shared_data_values->insertData(value.data(), value.size()); - } - getSharedDataOffsets().push_back(shared_data_paths->size()); - /// Insert default value in all remaining typed and dynamic paths. - - for (auto & [_, column] : typed_paths) - { - if (column->size() == current_size) - column->insertDefault(); - } - + /// Insert default value in all remaining dynamic paths. 
for (auto & [_, column] : dynamic_paths_ptrs) { if (column->size() == current_size) @@ -786,6 +790,11 @@ const char * ColumnObject::deserializeAndInsertFromArena(const char * pos) const char * ColumnObject::skipSerializedInArena(const char * pos) const { + /// First, skip all values of typed paths; + for (auto path : sorted_typed_paths) + pos = typed_paths.find(path)->second->skipSerializedInArena(pos); + + /// Second, skip all other paths and values. auto num_paths = unalignedLoad(pos); pos += sizeof(size_t); for (size_t i = 0; i != num_paths; ++i) @@ -794,15 +803,8 @@ const char * ColumnObject::skipSerializedInArena(const char * pos) const pos += sizeof(size_t); std::string_view path(pos, path_size); pos += path_size; - if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end()) - { - pos = typed_it->second->skipSerializedInArena(pos); - } - else - { - auto value_size = unalignedLoad(pos); - pos += sizeof(size_t) + value_size; - } + auto value_size = unalignedLoad(pos); + pos += sizeof(size_t) + value_size; } return pos; @@ -810,11 +812,51 @@ const char * ColumnObject::skipSerializedInArena(const char * pos) const void ColumnObject::updateHashWithValue(size_t n, SipHash & hash) const { - for (const auto & [_, column] : typed_paths) - column->updateHashWithValue(n, hash); - for (const auto & [_, column] : dynamic_paths_ptrs) - column->updateHashWithValue(n, hash); - shared_data->updateHashWithValue(n, hash); + for (auto path : sorted_typed_paths) + typed_paths.find(path)->second->updateHashWithValue(n, hash); + + /// The hash of the object in row should not depend on the way we store paths (in dynamic paths or in shared data) + /// and should be the same for the same objects. To support it we update hash with path and its value (if not null) in + /// sorted by path order from both dynamic paths and shared data. 
+ const auto [shared_data_paths, shared_data_values] = getSharedDataPathsAndValues(); + const auto & shared_data_offsets = getSharedDataOffsets(); + size_t start = shared_data_offsets[static_cast(n) - 1]; + size_t end = shared_data_offsets[static_cast(n)]; + auto dynamic_paths_it = sorted_dynamic_paths.begin(); + for (size_t i = start; i != end; ++i) + { + auto path = shared_data_paths->getDataAt(i).toView(); + /// Paths in shared data are sorted. Update hash with all paths from dynamic paths that go before this path in sorted order. + while (dynamic_paths_it != sorted_dynamic_paths.end() && *dynamic_paths_it < path) + { + const auto * dynamic_column = dynamic_paths_ptrs.find(*dynamic_paths_it)->second; + if (!dynamic_column->isNullAt(n)) + { + hash.update(*dynamic_paths_it); + dynamic_column->updateHashWithValue(n, hash); + } + ++dynamic_paths_it; + } + + /// Deserialize value in temporary column to get its hash. + auto value = shared_data_values->getDataAt(i); + ReadBufferFromMemory buf(value.data, value.size); + auto tmp_column = ColumnDynamic::create(); + getDynamicSerialization()->deserializeBinary(*tmp_column, buf, getFormatSettings()); + hash.update(path); + tmp_column->updateHashWithValue(0, hash); + } + + /// Iterate over all remaining paths in dynamic paths. 
+ for (; dynamic_paths_it != sorted_dynamic_paths.end(); ++dynamic_paths_it) + { + const auto * dynamic_column = dynamic_paths_ptrs.find(*dynamic_paths_it)->second; + if (!dynamic_column->isNullAt(n)) + { + hash.update(*dynamic_paths_it); + dynamic_column->updateHashWithValue(n, hash); + } + } } WeakHash32 ColumnObject::getWeakHash32() const @@ -1328,8 +1370,9 @@ void ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou { if (dynamic_paths.size() < max_dynamic_paths) { - dynamic_paths.emplace(path, ColumnDynamic::create(max_dynamic_types)); - dynamic_paths_ptrs.emplace(path, assert_cast(dynamic_paths.find(path)->second.get())); + auto it = dynamic_paths.emplace(path, ColumnDynamic::create(max_dynamic_types)).first; + dynamic_paths_ptrs.emplace(path, assert_cast(it->second.get())); + sorted_dynamic_paths.insert(it->first); } /// Add all remaining paths into shared data statistics until we reach its max size; else if (new_statistics.shared_data_paths_statistics.size() < Statistics::MAX_SHARED_DATA_STATISTICS_SIZE) @@ -1343,8 +1386,9 @@ void ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou { for (const auto & [path, _] : path_to_total_number_of_non_null_values) { - dynamic_paths[path] = ColumnDynamic::create(max_dynamic_types); - dynamic_paths_ptrs[path] = assert_cast(dynamic_paths[path].get()); + auto it = dynamic_paths.emplace(path, ColumnDynamic::create(max_dynamic_types)).first; + dynamic_paths_ptrs[path] = assert_cast(it->second.get()); + sorted_dynamic_paths.insert(it->first); } } diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h index f530ed29ef3..c7f282d9079 100644 --- a/src/Columns/ColumnObject.h +++ b/src/Columns/ColumnObject.h @@ -238,10 +238,15 @@ private: /// Map path -> column for paths with explicitly specified types. /// This set of paths is constant and cannot be changed. PathToColumnMap typed_paths; + /// Sorted list of typed paths. 
Used to avoid sorting paths every time in some methods. + std::vector sorted_typed_paths; /// Map path -> column for dynamically added paths. All columns /// here are Dynamic columns. This set of paths can be extended /// during inerts into the column. PathToColumnMap dynamic_paths; + /// Sorted list of dynamic paths. Used to avoid sorting paths every time in some methods. + std::set sorted_dynamic_paths; + /// Store and use pointers to ColumnDynamic to avoid virtual calls. /// With hundreds of dynamic paths these virtual calls are noticeable. PathToDynamicColumnPtrMap dynamic_paths_ptrs; diff --git a/tests/queries/0_stateless/03231_dynamic_uniq_group_by.reference b/tests/queries/0_stateless/03231_dynamic_uniq_group_by.reference new file mode 100644 index 00000000000..aafbd72ebc4 --- /dev/null +++ b/tests/queries/0_stateless/03231_dynamic_uniq_group_by.reference @@ -0,0 +1,5 @@ +4 +5 [1,2,3] +5 2020-01-01 +5 42 +5 Hello diff --git a/tests/queries/0_stateless/03231_dynamic_uniq_group_by.sql b/tests/queries/0_stateless/03231_dynamic_uniq_group_by.sql new file mode 100644 index 00000000000..fe052027f56 --- /dev/null +++ b/tests/queries/0_stateless/03231_dynamic_uniq_group_by.sql @@ -0,0 +1,15 @@ +set allow_experimental_dynamic_type = 1; +drop table if exists test; +create table test (d Dynamic(max_types=2)) engine=Memory; +insert into test values (42), ('Hello'), ([1,2,3]), ('2020-01-01'); +insert into test values ('Hello'), ([1,2,3]), ('2020-01-01'), (42); +insert into test values ([1,2,3]), ('2020-01-01'), (42), ('Hello'); +insert into test values ('2020-01-01'), (42), ('Hello'), ([1,2,3]); +insert into test values (42); +insert into test values ('Hello'); +insert into test values ([1,2,3]); +insert into test values ('2020-01-01'); + +select uniqExact(d) from test; +select count(), d from test group by d order by d; +drop table test; diff --git a/tests/queries/0_stateless/03232_json_uniq_group_by.reference 
b/tests/queries/0_stateless/03232_json_uniq_group_by.reference new file mode 100644 index 00000000000..1fc82458807 --- /dev/null +++ b/tests/queries/0_stateless/03232_json_uniq_group_by.reference @@ -0,0 +1,12 @@ +11 +6 {"a":0,"b":"Hello"} +6 {"a":0,"b":[{"f":"42"}]} +6 {"a":0,"c":"Hello"} +6 {"a":0,"c":["1","2","3"]} +6 {"a":0,"d":"2020-01-01"} +6 {"a":0,"d":["1","2","3"]} +6 {"a":0,"e":"2020-01-01"} +6 {"a":0,"e":[{"f":"42"}]} +5 {"a":42,"b":"Hello","c":["1","2","3"],"d":"2020-01-01","e":[{"f":"42"}]} +5 {"a":42,"b":[{"f":"42"}],"c":"Hello","d":["1","2","3"],"e":"2020-01-01"} +12 {"a":42} diff --git a/tests/queries/0_stateless/03232_json_uniq_group_by.sql b/tests/queries/0_stateless/03232_json_uniq_group_by.sql new file mode 100644 index 00000000000..5d39390d6e1 --- /dev/null +++ b/tests/queries/0_stateless/03232_json_uniq_group_by.sql @@ -0,0 +1,39 @@ +set allow_experimental_json_type = 1; +drop table if exists test; +create table test (json JSON(a UInt32, max_dynamic_paths=2)) engine=Memory; +insert into test values ('{"a" : 42, "b" : "Hello", "c" : [1, 2, 3], "d" : "2020-01-01", "e" : [{"f" : 42}]}'); +insert into test values ('{"b" : "Hello", "c" : [1, 2, 3], "d" : "2020-01-01", "e" : [{"f" : 42}], "a" : 42}'); +insert into test values ('{"c" : [1, 2, 3], "d" : "2020-01-01", "e" : [{"f" : 42}], "a" : 42, "b" : "Hello"}'); +insert into test values ('{"d" : "2020-01-01", "e" : [{"f" : 42}], "a" : 42, "b" : "Hello", "c" : [1, 2, 3]}'); +insert into test values ('{"e" : [{"f" : 42}], "a" : 42, "b" : "Hello", "c" : [1, 2, 3], "d" : "2020-01-01"}'); +insert into test values ('{"a" : 42}'), ('{"b" : "Hello"}'), ('{"c" : [1, 2, 3]}'), ('{"d" : "2020-01-01"}'), ('{"e" : [{"f" : 42}]}'); +insert into test values ('{"b" : "Hello"}'), ('{"c" : [1, 2, 3]}'), ('{"d" : "2020-01-01"}'), ('{"e" : [{"f" : 42}]}'), ('{"a" : 42}'); +insert into test values ('{"c" : [1, 2, 3]}'), ('{"d" : "2020-01-01"}'), ('{"e" : [{"f" : 42}]}'), ('{"a" : 42}'), ('{"b" : "Hello"}'); +insert into 
test values ('{"d" : "2020-01-01"}'), ('{"e" : [{"f" : 42}]}'), ('{"a" : 42}'), ('{"b" : "Hello"}'), ('{"c" : [1, 2, 3]}'); +insert into test values ('{"e" : [{"f" : 42}]}'), ('{"a" : 42}'), ('{"b" : "Hello"}'), ('{"c" : [1, 2, 3]}'), ('{"d" : "2020-01-01"}'); +insert into test values ('{"a" : 42}'); +insert into test values ('{"b" : "Hello"}'); +insert into test values ('{"c" : [1, 2, 3]}'); +insert into test values ('{"d" : "2020-01-01"}'); +insert into test values ('{"e" : [{"f" : 42}]}'); + +insert into test values ('{"a" : 42, "c" : "Hello", "d" : [1, 2, 3], "e" : "2020-01-01", "b" : [{"f" : 42}]}'); +insert into test values ('{"c" : "Hello", "d" : [1, 2, 3], "e" : "2020-01-01", "b" : [{"f" : 42}], "a" : 42}'); +insert into test values ('{"d" : [1, 2, 3], "e" : "2020-01-01", "b" : [{"f" : 42}], "a" : 42, "c" : "Hello"}'); +insert into test values ('{"e" : "2020-01-01", "b" : [{"f" : 42}], "a" : 42, "c" : "Hello", "d" : [1, 2, 3]}'); +insert into test values ('{"b" : [{"f" : 42}], "a" : 42, "c" : "Hello", "d" : [1, 2, 3], "e" : "2020-01-01"}'); +insert into test values ('{"a" : 42}'), ('{"c" : "Hello"}'), ('{"d" : [1, 2, 3]}'), ('{"e" : "2020-01-01"}'), ('{"b" : [{"f" : 42}]}'); +insert into test values ('{"c" : "Hello"}'), ('{"d" : [1, 2, 3]}'), ('{"e" : "2020-01-01"}'), ('{"b" : [{"f" : 42}]}'), ('{"a" : 42}'); +insert into test values ('{"d" : [1, 2, 3]}'), ('{"e" : "2020-01-01"}'), ('{"b" : [{"f" : 42}]}'), ('{"a" : 42}'), ('{"c" : "Hello"}'); +insert into test values ('{"e" : "2020-01-01"}'), ('{"b" : [{"f" : 42}]}'), ('{"a" : 42}'), ('{"c" : "Hello"}'), ('{"d" : [1, 2, 3]}'); +insert into test values ('{"b" : [{"f" : 42}]}'), ('{"a" : 42}'), ('{"c" : "Hello"}'), ('{"d" : [1, 2, 3]}'), ('{"e" : "2020-01-01"}'); +insert into test values ('{"a" : 42}'); +insert into test values ('{"c" : "Hello"}'); +insert into test values ('{"d" : [1, 2, 3]}'); +insert into test values ('{"e" : "2020-01-01"}'); +insert into test values ('{"b" : [{"f" : 42}]}'); + +select 
uniqExact(json) from test; +select count(), json from test group by json order by toString(json); + +drop table test; From a44b3d02681e5aeb4f1584483928a920b5843bcf Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 3 Sep 2024 17:31:07 +0000 Subject: [PATCH 2/3] Fix sorted typed paths --- src/Columns/ColumnObject.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index 920b0384448..d3af9812b5c 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -53,6 +53,7 @@ ColumnObject::ColumnObject( auto it = typed_paths.emplace(path, std::move(column)).first; sorted_typed_paths.push_back(it->first); } + std::sort(sorted_typed_paths.begin(), sorted_typed_paths.end()); dynamic_paths.reserve(dynamic_paths_.size()); dynamic_paths_ptrs.reserve(dynamic_paths_.size()); @@ -69,6 +70,7 @@ ColumnObject::ColumnObject( : max_dynamic_paths(max_dynamic_paths_), global_max_dynamic_paths(max_dynamic_paths_), max_dynamic_types(max_dynamic_types_) { typed_paths.reserve(typed_paths_.size()); + sorted_typed_paths.reserve(typed_paths_.size()); for (auto & [path, column] : typed_paths_) { if (!column->empty()) @@ -77,6 +79,8 @@ ColumnObject::ColumnObject( sorted_typed_paths.push_back(it->first); } + std::sort(sorted_typed_paths.begin(), sorted_typed_paths.end()); + MutableColumns paths_and_values; paths_and_values.emplace_back(ColumnString::create()); paths_and_values.emplace_back(ColumnString::create()); From 228ac44a92c720704aef7d62d990fa3696949def Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 18 Sep 2024 21:27:38 +0200 Subject: [PATCH 3/3] Fix asan issue --- src/Columns/ColumnObject.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index d3af9812b5c..3577ab1ec82 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -1356,6 +1356,7 @@ void 
ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou /// Reset current state. dynamic_paths.clear(); dynamic_paths_ptrs.clear(); + sorted_dynamic_paths.clear(); max_dynamic_paths = global_max_dynamic_paths; Statistics new_statistics(Statistics::Source::MERGE);