Use std::string_view for lookups in hash table

This commit is contained in:
avogar 2024-08-12 15:52:39 +00:00
parent 8522776c33
commit 44d4784da5
4 changed files with 76 additions and 49 deletions

View File

@ -5,6 +5,7 @@
#include <IO/WriteBufferFromString.h>
#include <IO/ReadBufferFromString.h>
#include <Common/Arena.h>
#include <Common/StringHashForHeterogeneousLookup.h>
namespace DB
{
@ -243,7 +244,7 @@ void ColumnObject::insertData(const char *, size_t)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertData is not supported for {}", getName());
}
ColumnDynamic * ColumnObject::tryToAddNewDynamicPath(const String & path)
ColumnDynamic * ColumnObject::tryToAddNewDynamicPath(const std::string_view path)
{
if (dynamic_paths.size() == max_dynamic_paths)
return nullptr;
@ -435,7 +436,7 @@ void ColumnObject::doInsertFrom(const IColumn & src, size_t n)
/// Second, insert dynamic paths and extend them if needed.
/// We can reach the limit of dynamic paths, and in this case
/// the rest of dynamic paths will be inserted into shared data.
std::vector<String> src_dynamic_paths_for_shared_data;
std::vector<std::string_view> src_dynamic_paths_for_shared_data;
for (const auto & [path, column] : src_object_column.dynamic_paths)
{
/// Check if we already have such dynamic path.
@ -469,7 +470,7 @@ void ColumnObject::doInsertRangeFrom(const IColumn & src, size_t start, size_t l
/// Second, insert dynamic paths and extend them if needed.
/// We can reach the limit of dynamic paths, and in this case
/// the rest of dynamic paths will be inserted into shared data.
std::vector<String> src_dynamic_paths_for_shared_data;
std::vector<std::string_view> src_dynamic_paths_for_shared_data;
for (const auto & [path, column] : src_object_column.dynamic_paths)
{
/// Check if we already have such dynamic path.
@ -487,7 +488,7 @@ void ColumnObject::doInsertRangeFrom(const IColumn & src, size_t start, size_t l
insertFromSharedDataAndFillRemainingDynamicPaths(src_object_column, std::move(src_dynamic_paths_for_shared_data), start, length);
}
void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::ColumnObject & src_object_column, std::vector<String> && src_dynamic_paths_for_shared_data, size_t start, size_t length)
void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::ColumnObject & src_object_column, std::vector<std::string_view> && src_dynamic_paths_for_shared_data, size_t start, size_t length)
{
/// Paths in shared data are sorted, so paths from src_dynamic_paths_for_shared_data should be inserted properly
/// to keep paths sorted. Let's sort them in advance.
@ -512,8 +513,8 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co
for (size_t i = start; i != start + length; ++i)
{
/// Paths in src_dynamic_paths_for_shared_data are already sorted.
for (const auto & path : src_dynamic_paths_for_shared_data)
serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, path, *src_object_column.dynamic_paths.at(path), i);
for (const auto path : src_dynamic_paths_for_shared_data)
serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, path, *src_object_column.dynamic_paths.find(path)->second, i);
shared_data_offsets.push_back(shared_data_paths->size());
}
}
@ -541,9 +542,9 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co
size_t end = src_shared_data_offsets[row];
for (size_t i = offset; i != end; ++i)
{
auto path = src_shared_data_paths->getDataAt(i);
auto path = src_shared_data_paths->getDataAt(i).toView();
/// Check if we have this path in dynamic paths.
if (auto it = dynamic_paths_ptrs.find(path.toString()); it != dynamic_paths_ptrs.end())
if (auto it = dynamic_paths_ptrs.find(path); it != dynamic_paths_ptrs.end())
{
/// Deserialize binary value into dynamic column from shared data.
deserializeValueFromSharedData(src_shared_data_values, i, *it->second);
@ -555,8 +556,8 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co
while (src_dynamic_paths_for_shared_data_index < src_dynamic_paths_for_shared_data.size()
&& src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index] < path)
{
const auto & dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index];
serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.at(dynamic_path), row);
const auto dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index];
serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.find(dynamic_path)->second, row);
++src_dynamic_paths_for_shared_data_index;
}
@ -569,8 +570,8 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co
/// Insert remaining dynamic paths from src_dynamic_paths_for_shared_data.
for (; src_dynamic_paths_for_shared_data_index != src_dynamic_paths_for_shared_data.size(); ++src_dynamic_paths_for_shared_data_index)
{
const auto & dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index];
serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.at(dynamic_path), row);
const auto dynamic_path = src_dynamic_paths_for_shared_data[src_dynamic_paths_for_shared_data_index];
serializePathAndValueIntoSharedData(shared_data_paths, shared_data_values, dynamic_path, *src_object_column.dynamic_paths.find(dynamic_path)->second, row);
}
shared_data_offsets.push_back(shared_data_paths->size());
@ -584,7 +585,7 @@ void ColumnObject::insertFromSharedDataAndFillRemainingDynamicPaths(const DB::Co
}
}
void ColumnObject::serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const String & path, const IColumn & column, size_t n)
void ColumnObject::serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const std::string_view path, const IColumn & column, size_t n)
{
/// Don't store Null values in shared data. We consider Null value equivalent to the absence
/// of this path in the row because we cannot distinguish these 2 cases for dynamic paths.
@ -700,11 +701,10 @@ const char * ColumnObject::deserializeAndInsertFromArena(const char * pos)
auto path_size = unalignedLoad<size_t>(pos);
pos += sizeof(size_t);
std::string_view path(pos, path_size);
String path_str(path);
pos += path_size;
/// Check if it's a typed path. In this case we should use
/// deserializeAndInsertFromArena of corresponding column.
if (auto typed_it = typed_paths.find(path_str); typed_it != typed_paths.end())
if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end())
{
pos = typed_it->second->deserializeAndInsertFromArena(pos);
}
@ -712,19 +712,18 @@ const char * ColumnObject::deserializeAndInsertFromArena(const char * pos)
/// to dynamic paths or shared data.
else
{
auto value_size = unalignedLoad<size_t>(pos);
pos += sizeof(size_t);
std::string_view value(pos, value_size);
pos += value_size;
/// Check if we have this path in dynamic paths.
if (auto dynamic_it = dynamic_paths.find(path_str); dynamic_it != dynamic_paths.end())
if (auto dynamic_it = dynamic_paths.find(path); dynamic_it != dynamic_paths.end())
{
ReadBufferFromMemory buf(value.data(), value.size());
getDynamicSerialization()->deserializeBinary(*dynamic_it->second, buf, getFormatSettings());
}
/// Try to add a new dynamic path.
else if (auto * dynamic_path_column = tryToAddNewDynamicPath(path_str))
else if (auto * dynamic_path_column = tryToAddNewDynamicPath(path))
{
ReadBufferFromMemory buf(value.data(), value.size());
getDynamicSerialization()->deserializeBinary(*dynamic_path_column, buf, getFormatSettings());
@ -773,7 +772,7 @@ const char * ColumnObject::skipSerializedInArena(const char * pos) const
{
auto path_size = unalignedLoad<size_t>(pos);
pos += sizeof(size_t);
String path(pos, path_size);
std::string_view path(pos, path_size);
pos += path_size;
if (auto typed_it = typed_paths.find(path); typed_it != typed_paths.end())
{
@ -1167,7 +1166,7 @@ void ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou
if (path_to_total_number_of_non_null_values.size() > max_dynamic_paths)
{
/// Sort paths by total_number_of_non_null_values.
std::vector<std::pair<size_t, String>> paths_with_sizes;
std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
paths_with_sizes.reserve(path_to_total_number_of_non_null_values.size());
for (const auto & [path, size] : path_to_total_number_of_non_null_values)
paths_with_sizes.emplace_back(size, path);
@ -1176,8 +1175,8 @@ void ColumnObject::takeDynamicStructureFromSourceColumns(const DB::Columns & sou
/// Fill dynamic_paths with first max_dynamic_paths paths in sorted list.
for (size_t i = 0; i != max_dynamic_paths; ++i)
{
dynamic_paths[paths_with_sizes[i].second] = ColumnDynamic::create(max_dynamic_types);
dynamic_paths_ptrs[paths_with_sizes[i].second] = assert_cast<ColumnDynamic *>(dynamic_paths[paths_with_sizes[i].second].get());
dynamic_paths.emplace(paths_with_sizes[i].second, ColumnDynamic::create(max_dynamic_types));
dynamic_paths_ptrs.emplace(paths_with_sizes[i].second, assert_cast<ColumnDynamic *>(dynamic_paths.find(paths_with_sizes[i].second)->second.get()));
}
}
/// Use all dynamic paths from all source columns.

View File

@ -9,7 +9,7 @@
#include <DataTypes/IDataType.h>
#include <DataTypes/Serializations/SerializationDynamic.h>
#include <Formats/FormatSettings.h>
#include <Common/StringHashForHeterogeneousLookup.h>
#include <Common/WeakHash.h>
namespace DB
@ -44,6 +44,9 @@ private:
size_t max_dynamic_types_,
const Statistics & statistics_ = {});
/// Use StringHashForHeterogeneousLookup hash for hash maps to be able to use std::string_view in find() method.
using PathToColumnMap = std::unordered_map<String, WrappedPtr, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
using PathToDynamicColumnPtrMap = std::unordered_map<String, ColumnDynamic *, StringHashForHeterogeneousLookup, StringHashForHeterogeneousLookup::transparent_key_equal>;
public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use mutate in order to make mutable column and mutate shared nested columns.
@ -158,14 +161,14 @@ public:
bool hasDynamicStructure() const override { return true; }
void takeDynamicStructureFromSourceColumns(const Columns & source_columns) override;
const std::unordered_map<String, WrappedPtr> & getTypedPaths() const { return typed_paths; }
std::unordered_map<String, WrappedPtr> & getTypedPaths() { return typed_paths; }
const PathToColumnMap & getTypedPaths() const { return typed_paths; }
PathToColumnMap & getTypedPaths() { return typed_paths; }
const std::unordered_map<String, WrappedPtr> & getDynamicPaths() const { return dynamic_paths; }
std::unordered_map<String, WrappedPtr> & getDynamicPaths() { return dynamic_paths; }
const PathToColumnMap & getDynamicPaths() const { return dynamic_paths; }
PathToColumnMap & getDynamicPaths() { return dynamic_paths; }
const std::unordered_map<String, ColumnDynamic *> & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; }
std::unordered_map<String, ColumnDynamic *> & getDynamicPathsPtrs() { return dynamic_paths_ptrs; }
const PathToDynamicColumnPtrMap & getDynamicPathsPtrs() const { return dynamic_paths_ptrs; }
PathToDynamicColumnPtrMap & getDynamicPathsPtrs() { return dynamic_paths_ptrs; }
const Statistics & getStatistics() const { return statistics; }
@ -198,12 +201,12 @@ public:
/// Try to add new dynamic path. Returns pointer to the new dynamic
/// path column or nullptr if limit on dynamic paths is reached.
ColumnDynamic * tryToAddNewDynamicPath(const String & path);
ColumnDynamic * tryToAddNewDynamicPath(const std::string_view path);
void setDynamicPaths(const std::vector<String> & paths);
void setStatistics(const Statistics & statistics_) { statistics = statistics_; }
void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const String & path, const IColumn & column, size_t n);
void serializePathAndValueIntoSharedData(ColumnString * shared_data_paths, ColumnString * shared_data_values, const std::string_view path, const IColumn & column, size_t n);
void deserializeValueFromSharedData(const ColumnString * shared_data_values, size_t n, IColumn & column) const;
/// Paths in shared data are sorted in each row. Use this method to find the lower bound for specific path in the row.
@ -212,19 +215,19 @@ public:
static void fillPathColumnFromSharedData(IColumn & path_column, StringRef path, const ColumnPtr & shared_data_column, size_t start, size_t end);
private:
void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<String> && src_dynamic_paths_for_shared_data, size_t start, size_t length);
void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<std::string_view> && src_dynamic_paths_for_shared_data, size_t start, size_t length);
void serializePathAndValueIntoArena(Arena & arena, const char *& begin, StringRef path, StringRef value, StringRef & res) const;
/// Map path -> column for paths with explicitly specified types.
/// This set of paths is constant and cannot be changed.
std::unordered_map<String, WrappedPtr> typed_paths;
PathToColumnMap typed_paths;
/// Map path -> column for dynamically added paths. All columns
/// here are Dynamic columns. This set of paths can be extended
/// during inserts into the column.
std::unordered_map<String, WrappedPtr> dynamic_paths;
PathToColumnMap dynamic_paths;
/// Store and use pointers to ColumnDynamic to avoid virtual calls.
/// With hundreds of dynamic paths these virtual calls are noticeable.
std::unordered_map<String, ColumnDynamic *> dynamic_paths_ptrs;
PathToDynamicColumnPtrMap dynamic_paths_ptrs;
/// Shared storage for all other paths and values. It's filled
/// when the number of dynamic paths reaches the limit.
/// It has type Array(Tuple(String, String)) and stores

View File

@ -0,0 +1,25 @@
#pragma once
#include <base/StringRef.h>
namespace DB
{
/// Transparent hash functor for hash maps keyed by std::string.
///
/// Together with `transparent_key_equal` it allows find() to be called with a
/// std::string_view (or a C string) without constructing a temporary std::string key.
/// Every overload hashes through std::hash<std::string_view>, so the same character
/// sequence produces the same hash value regardless of the argument type.
///
/// See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0919r3.html
struct StringHashForHeterogeneousLookup
{
    using hash_type = std::hash<std::string_view>;
    using transparent_key_equal = std::equal_to<>;
    using is_transparent = void; // required to make find() work with different type than key_type

    /// Hash a non-owning view of the key.
    auto operator()(const std::string_view view) const
    {
        return hash_type()(view);
    }

    /// Hash an owning string; it is viewed as std::string_view, no copy is made.
    auto operator()(const std::string & str) const
    {
        return hash_type()(str);
    }

    /// Hash a null-terminated C string (e.g. a string literal).
    /// Without this overload a call with `const char *` is ambiguous between the
    /// std::string_view and std::string overloads, because both need a user-defined conversion.
    /// `str` must not be nullptr.
    auto operator()(const char * str) const
    {
        return hash_type()(std::string_view(str));
    }
};
}

View File

@ -139,7 +139,7 @@ private:
{
/// Collect all dynamic paths.
const auto & dynamic_path_columns = column_object.getDynamicPaths();
std::vector<String> dynamic_paths;
std::vector<std::string_view> dynamic_paths;
dynamic_paths.reserve(dynamic_path_columns.size());
for (const auto & [path, _] : dynamic_path_columns)
dynamic_paths.push_back(path);
@ -149,11 +149,11 @@ private:
size_t size = column_object.size();
for (size_t i = 0; i != size; ++i)
{
for (const auto & path : dynamic_paths)
for (const auto path : dynamic_paths)
{
/// Don't include path if it contains NULL, because we consider
/// it to be equivalent to the absence of this path in this row.
if (!dynamic_path_columns.at(path)->isNullAt(i))
if (!dynamic_path_columns.find(path)->second->isNullAt(i))
data.insertData(path.data(), path.size());
}
offsets.push_back(data.size());
@ -162,7 +162,7 @@ private:
}
/// Collect all paths: typed, dynamic and paths from shared data.
std::vector<String> sorted_dynamic_and_typed_paths;
std::vector<std::string_view> sorted_dynamic_and_typed_paths;
const auto & typed_path_columns = column_object.getTypedPaths();
const auto & dynamic_path_columns = column_object.getDynamicPaths();
sorted_dynamic_and_typed_paths.reserve(typed_path_columns.size() + dynamic_path_columns.size());
@ -184,22 +184,22 @@ private:
size_t sorted_paths_index = 0;
for (size_t j = start; j != end; ++j)
{
auto shared_data_path = shared_data_paths->getDataAt(j);
auto shared_data_path = shared_data_paths->getDataAt(j).toView();
while (sorted_paths_index != sorted_dynamic_and_typed_paths.size() && sorted_dynamic_and_typed_paths[sorted_paths_index] < shared_data_path)
{
const auto & path = sorted_dynamic_and_typed_paths[sorted_paths_index];
const auto path = sorted_dynamic_and_typed_paths[sorted_paths_index];
/// If it's dynamic path include it only if it's not NULL.
if (auto it = dynamic_path_columns.find(path); it == dynamic_path_columns.end() || !it->second->isNullAt(i))
data.insertData(path.data(), path.size());
++sorted_paths_index;
}
data.insertData(shared_data_path.data, shared_data_path.size);
data.insertData(shared_data_path.data(), shared_data_path.size());
}
for (; sorted_paths_index != sorted_dynamic_and_typed_paths.size(); ++sorted_paths_index)
{
const auto & path = sorted_dynamic_and_typed_paths[sorted_paths_index];
const auto path = sorted_dynamic_and_typed_paths[sorted_paths_index];
if (auto it = dynamic_path_columns.find(path); it == dynamic_path_columns.end() || !it->second->isNullAt(i))
data.insertData(path.data(), path.size());
}
@ -220,7 +220,7 @@ private:
if constexpr (Impl::paths_mode == PathsMode::DYNAMIC_PATHS)
{
const auto & dynamic_path_columns = column_object.getDynamicPaths();
std::vector<String> sorted_dynamic_paths;
std::vector<std::string_view> sorted_dynamic_paths;
sorted_dynamic_paths.reserve(dynamic_path_columns.size());
for (const auto & [path, _] : dynamic_path_columns)
sorted_dynamic_paths.push_back(path);
@ -230,9 +230,9 @@ private:
/// Iterate over all rows and extract types from dynamic columns.
for (size_t i = 0; i != column_object.size(); ++i)
{
for (auto & path : sorted_dynamic_paths)
for (const auto path : sorted_dynamic_paths)
{
auto column = dynamic_path_columns.at(path);
const auto & column = dynamic_path_columns.find(path)->second;
if (!column->isNullAt(i))
{
auto type = getDynamicValueType(column, i);
@ -272,7 +272,7 @@ private:
}
/// Iterate over all rows and extract types from dynamic columns from dynamic paths and from values in shared data.
std::vector<std::pair<String, String>> sorted_typed_and_dynamic_paths_with_types;
std::vector<std::pair<std::string_view, String>> sorted_typed_and_dynamic_paths_with_types;
const auto & typed_path_types = type_object.getTypedPaths();
const auto & dynamic_path_columns = column_object.getDynamicPaths();
sorted_typed_and_dynamic_paths_with_types.reserve(typed_path_types.size() + dynamic_path_columns.size());
@ -294,7 +294,7 @@ private:
size_t sorted_paths_index = 0;
for (size_t j = start; j != end; ++j)
{
auto shared_data_path = shared_data_paths->getDataAt(j);
auto shared_data_path = shared_data_paths->getDataAt(j).toView();
auto type_name = getDynamicValueTypeFromSharedData(shared_data_values->getDataAt(j));
/// Skip NULL values.
if (!type_name)
@ -319,7 +319,7 @@ private:
++sorted_paths_index;
}
paths_column->insertData(shared_data_path.data, shared_data_path.size);
paths_column->insertData(shared_data_path.data(), shared_data_path.size());
types_column->insertData(type_name->data(), type_name->size());
}