diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index 1f16c12f6ba..4ab4a4e441a 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -243,7 +244,7 @@ void ColumnObject::insertData(const char *, size_t) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertData is not supported for {}", getName()); } -ColumnDynamic * ColumnObject::tryToAddNewDynamicPath(const String & path) +ColumnDynamic * ColumnObject::tryToAddNewDynamicPath(const std::string_view path) { if (dynamic_paths.size() == max_dynamic_paths) return nullptr; @@ -435,7 +436,7 @@ void ColumnObject::doInsertFrom(const IColumn & src, size_t n) /// Second, insert dynamic paths and extend them if needed. /// We can reach the limit of dynamic paths, and in this case /// the rest of dynamic paths will be inserted into shared data. - std::vector src_dynamic_paths_for_shared_data; + std::vector src_dynamic_paths_for_shared_data; for (const auto & [path, column] : src_object_column.dynamic_paths) { /// Check if we already have such dynamic path. @@ -469,7 +470,7 @@ void ColumnObject::doInsertRangeFrom(const IColumn & src, size_t start, size_t l /// Second, insert dynamic paths and extend them if needed. /// We can reach the limit of dynamic paths, and in this case /// the rest of dynamic paths will be inserted into shared data. - std::vector src_dynamic_paths_for_shared_data; + std::vector src_dynamic_paths_for_shared_data; for (const auto & [path, column] : src_object_column.dynamic_paths) { /// Check if we already have such dynamic path. @@ -487,7 +488,7 @@ void ColumnObject::doInsertRangeFrom(const IColumn & src, size_t start, size_t l insertFromSharedDataAndFillRemainingDynamicPaths(src_object_column, std::move(src_dynamic_paths_for_shared_data), start, length); } -void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::ColumnObject & src_object_column, std::vector && src_dynamic_paths_for_shared_data, size_t start, size_t length) +void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::ColumnObject & src_object_column, std::vector && src_dynamic_paths_for_shared_data, size_t start, size_t length) { /// Paths in shared data are sorted, so paths from src_dynamic_paths_for_shared_data should be inserted properly /// to keep paths sorted. Let's sort them in advance. @@ -512,8 +513,8 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co for (size_t i = start; i != start + length; ++i) { /// Paths in src_dynamic_paths_for_shared_data are already sorted. - for (const auto & path : src_dynamic_paths_for_shared_data) - serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, path, *src_object_column.dynamic_paths.at(path), i); + for (const auto path : src_dynamic_paths_for_shared_data) + serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, path, *src_object_column.dynamic_paths.find(path)->second, i); shared_data_offsets.push_back(shared_data_paths->size()); } } @@ -541,9 +542,9 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co size_t end = src_shared_data_offsets[row]; for (size_t i = offset; i != end; ++i) { - auto path = src_shared_data_paths->getDataAt(i); + auto path = src_shared_data_paths->getDataAt(i).toView(); /// Check if we have this path in dynamic paths. - if (auto it = dynamic_paths_ptrs.find(path.toString()); it != dynamic_paths_ptrs.end()) + if (auto it = dynamic_paths_ptrs.find(path); it != dynamic_paths_ptrs.end()) { /// Deserialize binary value into dynamic column from shared data. deserializeValueFromSharedData(src_shared_data_values, i, *it->second); @@ -555,8 +556,8 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co while (src_dynamic_paths_for_shared_data_index < src_dynamic_paths_for_shared_data.size() && src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index] < path) { - const auto & dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index]; - serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.at(dynamic_path), row); + const auto dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index]; + serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.find(dynamic_path)->second, row); ++src_dynamic_paths_for_shared_data_index; } @@ -569,8 +570,8 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co /// Insert remaining dynamic paths from src_dynamic_paths_for_shared_data. for (; src_dynamic_paths_for_shared_data_index != src_dynamic_paths_for_shared_data.size(); ++src_dynamic_paths_for_shared_data_index) { - const auto & dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index]; - serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.at(dynamic_path), row); + const auto dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index]; + serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.find(dynamic_path)->second, row); } shared_data_offsets.push_back(shared_data_paths->size()); @@ -584,7 +585,7 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co } } -void ColumnObject::serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const String & path, const IColumn & column, size_t n) +void ColumnObject::serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const std::string_view path, const IColumn & column, size_t n) { /// Don't store Null values in shared data. We consider Null value equivalent to the absence /// of this path in the row because we cannot distinguish these 2 cases for dynamic paths. @@ -700,11 +701,10 @@ const char * ColumnObject::deserializeAndInsertFromArena(const char * pos) auto path_size = unalignedLoad(pos); pos += sizeof(size_t); std::string_view path(pos, path_size); - String path_str(path); pos += path_size; /// Check if it's a typed path. In this case we should use /// deserializeAndInsertFromArena of corresponding column. - if (auto typed_it = typed_paths.find(path_str); typed_it != typed_paths.end()) + if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end()) { pos = typed_it->second->deserializeAndInsertFromArena(pos); } @@ -712,19 +712,18 @@ const char * ColumnObject::deserializeAndInsertFromArena(const char * pos) /// to dynamic paths or shared data. else { - auto value_size = unalignedLoad(pos); pos += sizeof(size_t); std::string_view value(pos, value_size); pos += value_size; /// Check if we have this path in dynamic paths. - if (auto dynamic_it = dynamic_paths.find(path_str); dynamic_it != dynamic_paths.end()) + if (auto dynamic_it = dynamic_paths.find(path); dynamic_it != dynamic_paths.end()) { ReadBufferFromMemory buf(value.data(), value.size()); getDynamicSerialization()->deserializeBinary(*dynamic_it->second, buf, getFormatSettings()); } /// Try to add a new dynamic path. - else if (auto * dynamic_path_column = tryToAddNewDynamicPath(path_str)) + else if (auto * dynamic_path_column = tryToAddNewDynamicPath(path)) { ReadBufferFromMemory buf(value.data(), value.size()); getDynamicSerialization()->deserializeBinary(*dynamic_path_column, buf, getFormatSettings()); @@ -773,7 +772,7 @@ const char * ColumnObject::skipSerializedInArena(const char * pos) const { auto path_size = unalignedLoad(pos); pos += sizeof(size_t); - String path(pos, path_size); + std::string_view path(pos, path_size); pos += path_size; if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end()) { @@ -1167,7 +1166,7 @@ void ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou if (path_to_total_number_of_non_null_values.size() > max_dynamic_paths) { /// Sort paths by total_number_of_non_null_values. - std::vector> paths_with_sizes; + std::vector> paths_with_sizes; paths_with_sizes.reserve(path_to_total_number_of_non_null_values.size()); for (const auto & [path, size] : path_to_total_number_of_non_null_values) paths_with_sizes.emplace_back(size, path); @@ -1176,8 +1175,8 @@ void ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou /// Fill dynamic_paths with first max_dynamic_paths paths in sorted list. for (size_t i = 0; i != max_dynamic_paths; ++i) { - dynamic_paths[paths_with_sizes[i].second] = ColumnDynamic::create(max_dynamic_types); - dynamic_paths_ptrs[paths_with_sizes[i].second] = assert_cast(dynamic_paths[paths_with_sizes[i].second].get()); + dynamic_paths.emplace(paths_with_sizes[i].second, ColumnDynamic::create(max_dynamic_types)); + dynamic_paths_ptrs.emplace(paths_with_sizes[i].second, assert_cast(dynamic_paths.find(paths_with_sizes[i].second)->second.get())); } } /// Use all dynamic paths from all source columns. diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h index fbb68897e08..ecb6c4e0e15 100644 --- a/src/Columns/ColumnObject.h +++ b/src/Columns/ColumnObject.h @@ -9,7 +9,7 @@ #include #include #include - +#include #include namespace DB @@ -44,6 +44,9 @@ private: size_t max_dynamic_types_, const Statistics & statistics_ = {}); + /// Use StringHashForHeterogeneousLookup hash for hash maps to be able to use std::string_view in find() method. + using PathToColumnMap = std::unordered_map; + using PathToDynamicColumnPtrMap = std::unordered_map; public: /** Create immutable column using immutable arguments. This arguments may be shared with other columns. * Use mutate in order to make mutable column and mutate shared nested columns. @@ -158,14 +161,14 @@ public: bool hasDynamicStructure() const override { return true; } void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override; - const std::unordered_map & getTypedPaths() const { return typed_paths; } - std::unordered_map & getTypedPaths() { return typed_paths; } + const PathToColumnMap & getTypedPaths() const { return typed_paths; } + PathToColumnMap & getTypedPaths() { return typed_paths; } - const std::unordered_map & getDynamicPaths() const { return dynamic_paths; } - std::unordered_map & getDynamicPaths() { return dynamic_paths; } + const PathToColumnMap & getDynamicPaths() const { return dynamic_paths; } + PathToColumnMap & getDynamicPaths() { return dynamic_paths; } - const std::unordered_map & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; } - std::unordered_map & getDynamicPathsPtrs() { return dynamic_paths_ptrs; } + const PathToDynamicColumnPtrMap & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; } + PathToDynamicColumnPtrMap & getDynamicPathsPtrs() { return dynamic_paths_ptrs; } const Statistics & getStatistics() const { return statistics; } @@ -198,12 +201,12 @@ public: /// Try to add new dynamic path. Returns pointer to the new dynamic /// path column or nullptr if limit on dynamic paths is reached. - ColumnDynamic * tryToAddNewDynamicPath(const String & path); + ColumnDynamic * tryToAddNewDynamicPath(const std::string_view path); void setDynamicPaths(const std::vector & paths); void setStatistics(const Statistics & statistics_) { statistics = statistics_; } - void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const String & path, const IColumn & column, size_t n); + void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const std::string_view path, const IColumn & column, size_t n); void deserializeValueFromSharedData(const ColumnString * shared_data_values, size_t n, IColumn & column) const; /// Paths in shared data are sorted in each row. Use this method to find the lower bound for specific path in the row. @@ -212,19 +215,19 @@ public: static void fillPathColumnFromSharedData(IColumn & path_column, StringRef path, const ColumnPtr & shared_data_column, size_t start, size_t end); private: - void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector && src_dynamic_paths_for_shared_data, size_t start, size_t length); + void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector && src_dynamic_paths_for_shared_data, size_t start, size_t length); void serializePathAndValueIntoArena(Arena & arena, const char *& begin, StringRef path, StringRef value, StringRef & res) const; /// Map path -> column for paths with explicitly specified types. /// This set of paths is constant and cannot be changed. - std::unordered_map typed_paths; + PathToColumnMap typed_paths; /// Map path -> column for dynamically added paths. All columns /// here are Dynamic columns. This set of paths can be extended /// during inerts into the column. - std::unordered_map dynamic_paths; + PathToColumnMap dynamic_paths; /// Store and use pointers to ColumnDynamic to avoid virtual calls. /// With hundreds of dynamic paths these virtual calls are noticeable. - std::unordered_map dynamic_paths_ptrs; + PathToDynamicColumnPtrMap dynamic_paths_ptrs; /// Shared storage for all other paths and values. It's filled /// when the number of dynamic paths reaches the limit. /// It has type Array(Tuple(String, String)) and stores diff --git a/src/Common/StringHashForHeterogeneousLookup.h b/src/Common/StringHashForHeterogeneousLookup.h new file mode 100644 index 00000000000..0983fd460d6 --- /dev/null +++ b/src/Common/StringHashForHeterogeneousLookup.h @@ -0,0 +1,25 @@ +#pragma once +#include + +namespace DB +{ + +/// See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r3.html +struct StringHashForHeterogeneousLookup +{ + using hash_type = std::hash; + using transparent_key_equal = std::equal_to<>; + using is_transparent = void; // required to make find() work with different type than key_type + + auto operator()(const std::string_view view) const + { + return hash_type()(view); + } + + auto operator()(const std::string & str) const + { + return hash_type()(str); + } +}; + +} diff --git a/src/Functions/JSONPaths.cpp b/src/Functions/JSONPaths.cpp index 4a84cec711b..35613e40aac 100644 --- a/src/Functions/JSONPaths.cpp +++ b/src/Functions/JSONPaths.cpp @@ -139,7 +139,7 @@ private: { /// Collect all dynamic paths. const auto & dynamic_path_columns = column_object.getDynamicPaths(); - std::vector dynamic_paths; + std::vector dynamic_paths; dynamic_paths.reserve(dynamic_path_columns.size()); for (const auto & [path, _] : dynamic_path_columns) dynamic_paths.push_back(path); @@ -149,11 +149,11 @@ private: size_t size = column_object.size(); for (size_t i = 0; i != size; ++i) { - for (const auto & path : dynamic_paths) + for (const auto path : dynamic_paths) { /// Don't include path if it contains NULL, because we consider /// it to be equivalent to the absence of this path in this row. - if (!dynamic_path_columns.at(path)->isNullAt(i)) + if (!dynamic_path_columns.find(path)->second->isNullAt(i)) data.insertData(path.data(), path.size()); } offsets.push_back(data.size()); @@ -162,7 +162,7 @@ private: } /// Collect all paths: typed, dynamic and paths from shared data. - std::vector sorted_dynamic_and_typed_paths; + std::vector sorted_dynamic_and_typed_paths; const auto & typed_path_columns = column_object.getTypedPaths(); const auto & dynamic_path_columns = column_object.getDynamicPaths(); sorted_dynamic_and_typed_paths.reserve(typed_path_columns.size() + dynamic_path_columns.size()); @@ -184,22 +184,22 @@ private: size_t sorted_paths_index = 0; for (size_t j = start; j != end; ++j) { - auto shared_data_path = shared_data_paths->getDataAt(j); + auto shared_data_path = shared_data_paths->getDataAt(j).toView(); while (sorted_paths_index != sorted_dynamic_and_typed_paths.size() && sorted_dynamic_and_typed_paths[sorted_paths_index] < shared_data_path) { - const auto & path = sorted_dynamic_and_typed_paths[sorted_paths_index]; + const auto path = sorted_dynamic_and_typed_paths[sorted_paths_index]; /// If it's dynamic path include it only if it's not NULL. if (auto it = dynamic_path_columns.find(path); it == dynamic_path_columns.end() || !it->second->isNullAt(i)) data.insertData(path.data(), path.size()); ++sorted_paths_index; } - data.insertData(shared_data_path.data, shared_data_path.size); + data.insertData(shared_data_path.data(), shared_data_path.size()); } for (; sorted_paths_index != sorted_dynamic_and_typed_paths.size(); ++sorted_paths_index) { - const auto & path = sorted_dynamic_and_typed_paths[sorted_paths_index]; + const auto path = sorted_dynamic_and_typed_paths[sorted_paths_index]; if (auto it = dynamic_path_columns.find(path); it == dynamic_path_columns.end() || !it->second->isNullAt(i)) data.insertData(path.data(), path.size()); } @@ -220,7 +220,7 @@ private: if constexpr (Impl::paths_mode == PathsMode::DYNAMIC_PATHS) { const auto & dynamic_path_columns = column_object.getDynamicPaths(); - std::vector sorted_dynamic_paths; + std::vector sorted_dynamic_paths; sorted_dynamic_paths.reserve(dynamic_path_columns.size()); for (const auto & [path, _] : dynamic_path_columns) sorted_dynamic_paths.push_back(path); @@ -230,9 +230,9 @@ private: /// Iterate over all rows and extract types from dynamic columns. for (size_t i = 0; i != column_object.size(); ++i) { - for (auto & path : sorted_dynamic_paths) + for (const auto path : sorted_dynamic_paths) { - auto column = dynamic_path_columns.at(path); + const auto & column = dynamic_path_columns.find(path)->second; if (!column->isNullAt(i)) { auto type = getDynamicValueType(column, i); @@ -272,7 +272,7 @@ private: } /// Iterate over all rows and extract types from dynamic columns from dynamic paths and from values in shared data. - std::vector> sorted_typed_and_dynamic_paths_with_types; + std::vector> sorted_typed_and_dynamic_paths_with_types; const auto & typed_path_types = type_object.getTypedPaths(); const auto & dynamic_path_columns = column_object.getDynamicPaths(); sorted_typed_and_dynamic_paths_with_types.reserve(typed_path_types.size() + dynamic_path_columns.size()); @@ -294,7 +294,7 @@ private: size_t sorted_paths_index = 0; for (size_t j = start; j != end; ++j) { - auto shared_data_path = shared_data_paths->getDataAt(j); + auto shared_data_path = shared_data_paths->getDataAt(j).toView(); auto type_name = getDynamicValueTypeFromSharedData(shared_data_values->getDataAt(j)); /// Skip NULL values. if (!type_name) @@ -319,7 +319,7 @@ private: ++sorted_paths_index; } - paths_column->insertData(shared_data_path.data, shared_data_path.size); + paths_column->insertData(shared_data_path.data(), shared_data_path.size()); types_column->insertData(type_name->data(), type_name->size()); }