Updated dictGetDescendants, dictGetChildren implementation

This commit is contained in:
Maksim Kita 2021-03-25 16:23:19 +03:00
parent 566541528d
commit 720e2e0501
6 changed files with 327 additions and 180 deletions

View File

@ -218,6 +218,8 @@ ColumnUInt8::Ptr FlatDictionary::isInHierarchy(
auto result = ColumnUInt8::create();
result->getData() = std::move(is_in_hierarchy_result);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
return result;
}
@ -231,44 +233,22 @@ ColumnPtr FlatDictionary::getDescendants(
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const UInt64 null_value = std::get<UInt64>(hierarchical_attribute.null_values);
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.arrays);
HashMap<UInt64, UInt64> parent_to_child;
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
for (size_t i = 0; i < parent_keys.size(); ++i)
{
auto parent_key = parent_keys[i];
parent_to_child[parent_key] = static_cast<UInt64>(i);
if (loaded_ids[i])
parent_to_child[parent_key].emplace_back(static_cast<UInt64>(i));
}
auto result = getDescendantsArray(keys, parent_to_child, level);
std::cerr << "FlatDictionary::getDescendants " << parent_to_child.size() << std::endl;
for (auto & node : parent_to_child)
{
std::cerr << node.getKey() << " " << node.getMapped() << std::endl;
}
std::cerr << std::endl;
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
auto is_key_valid_func = [&](auto & key)
{
return parent_to_child.find(key) != nullptr;
};
auto get_child_key_func = [&](auto & key)
{
std::optional<UInt64> result;
auto it = parent_to_child.find(key);
if (it)
result = it->getMapped();
return result;
};
auto result = getDescendantsArray(keys, null_value, level, is_key_valid_func, get_child_key_func);
return result;
}

View File

@ -287,6 +287,34 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
return nullptr;
}
/** Returns the descendants of every key in key_column as an array column.
  * Only dictionaries with simple keys are handled; for any other key type nullptr is returned.
  * The hierarchical attribute container (child key -> parent key) is inverted into
  * a parent -> children map, which is then handed to getDescendantsArray together with level.
  */
template <DictionaryKeyType dictionary_key_type, bool sparse>
ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
    ColumnPtr key_column,
    const DataTypePtr &,
    size_t level) const
{
    if constexpr (dictionary_key_type != DictionaryKeyType::simple)
    {
        return nullptr;
    }
    else
    {
        /// Storage that getColumnVectorData may use when it cannot reference the column data directly.
        PaddedPODArray<UInt64> requested_keys_storage;
        const auto & requested_keys = getColumnVectorData(this, key_column, requested_keys_storage);

        const auto & hierarchy_attribute = attributes[*dict_struct.hierarchical_attribute_index];
        const CollectionType<UInt64> & child_to_parent = std::get<CollectionType<UInt64>>(hierarchy_attribute.container);

        /// Invert the relation: group every child under its parent.
        HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
        for (const auto & [child, parent] : child_to_parent)
            parent_to_child[parent].emplace_back(child);

        return getDescendantsArray(requested_keys, parent_to_child, level);
    }
}
template <DictionaryKeyType dictionary_key_type, bool sparse>
void HashedDictionary<dictionary_key_type, sparse>::createAttributes()
{

View File

@ -102,6 +102,11 @@ public:
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
/// Override that looks up descendants for the keys in key_column.
/// NOTE(review): level presumably selects the hierarchy depth to return, with 0 meaning
/// "all descendants" - confirm against getDescendantsArray.
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:

View File

@ -1,7 +1,5 @@
#include "HierarchyDictionariesUtils.h"
#include <Common/HashTable/HashSet.h>
namespace DB
{

View File

@ -3,6 +3,7 @@
#include <common/types.h>
#include <Common/PODArray.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
@ -18,116 +19,282 @@ namespace ErrorCodes
extern const int UNSUPPORTED_METHOD;
}
template <typename KeyType>
struct ElementsAndOffsets
namespace detail
{
PaddedPODArray<KeyType> elements;
PaddedPODArray<IColumn::Offset> offsets;
};
template <typename T>
struct IsKeyValidFuncInterface
{
bool operator()(T key [[maybe_unused]]) { return false; }
};
template <typename T>
struct GetKeyFuncInterface
{
std::optional<T> operator()(T key [[maybe_unused]]) { return {}; }
};
template <typename KeyType, typename IsKeyValidFunc, typename GetKeyFunc>
ElementsAndOffsets<KeyType> getKeysHierarchy(
const PaddedPODArray<KeyType> & hierarchy_keys,
const KeyType & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetKeyFunc && get_key_func)
{
size_t hierarchy_keys_size = hierarchy_keys.size();
PaddedPODArray<KeyType> elements;
elements.reserve(hierarchy_keys_size);
PaddedPODArray<IColumn::Offset> offsets;
offsets.reserve(hierarchy_keys_size);
struct OffsetInArray
template <typename KeyType>
struct ElementsAndOffsets
{
size_t offset_index;
size_t array_element_offset;
PaddedPODArray<KeyType> elements;
PaddedPODArray<IColumn::Offset> offsets;
};
HashMap<KeyType, OffsetInArray> already_processes_keys_to_offset;
already_processes_keys_to_offset.reserve(hierarchy_keys_size);
for (size_t i = 0; i < hierarchy_keys_size; ++i)
template <typename T>
struct IsKeyValidFuncInterface
{
auto hierarchy_key = hierarchy_keys[i];
size_t current_hierarchy_depth = 0;
bool operator()(T key [[maybe_unused]]) { return false; }
};
bool is_key_valid = std::forward<IsKeyValidFunc>(is_key_valid_func)(hierarchy_key);
template <typename T>
struct GetParentKeyFuncInterface
{
std::optional<T> operator()(T key [[maybe_unused]]) { return {}; }
};
if (!is_key_valid)
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
ElementsAndOffsets<KeyType> getKeysHierarchy(
const PaddedPODArray<KeyType> & hierarchy_keys,
const KeyType & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_key_func)
{
size_t hierarchy_keys_size = hierarchy_keys.size();
PaddedPODArray<KeyType> elements;
elements.reserve(hierarchy_keys_size);
PaddedPODArray<IColumn::Offset> offsets;
offsets.reserve(hierarchy_keys_size);
struct OffsetInArray
{
offsets.emplace_back(elements.size());
continue;
}
size_t offset_index;
size_t array_element_offset;
};
while (true)
HashMap<KeyType, OffsetInArray> already_processes_keys_to_offset;
already_processes_keys_to_offset.reserve(hierarchy_keys_size);
for (size_t i = 0; i < hierarchy_keys_size; ++i)
{
const auto * it = already_processes_keys_to_offset.find(hierarchy_key);
auto hierarchy_key = hierarchy_keys[i];
size_t current_hierarchy_depth = 0;
if (it)
bool is_key_valid = std::forward<IsKeyValidFunc>(is_key_valid_func)(hierarchy_key);
if (!is_key_valid)
{
const auto & index = it->getMapped();
size_t offset = index.offset_index;
bool is_loop = (offset == offsets.size());
if (unlikely(is_loop))
break;
size_t array_element_offset = index.array_element_offset;
size_t previous_offset_size = offset > 0 ? offsets[offset - 1] : 0;
size_t start_index = previous_offset_size + array_element_offset;
size_t end_index = offsets[offset];
current_hierarchy_depth += end_index - start_index;
/// TODO: Insert part of pod array into itself
while (start_index < end_index)
{
elements.emplace_back(elements[start_index]);
++start_index;
}
break;
offsets.emplace_back(elements.size());
continue;
}
if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
break;
while (true)
{
const auto * it = already_processes_keys_to_offset.find(hierarchy_key);
already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth};
elements.emplace_back(hierarchy_key);
++current_hierarchy_depth;
if (it)
{
const auto & index = it->getMapped();
std::optional<KeyType> parent_key = std::forward<GetKeyFunc>(get_key_func)(hierarchy_key);
size_t offset = index.offset_index;
if (!parent_key.has_value())
break;
bool is_loop = (offset == offsets.size());
hierarchy_key = *parent_key;
if (unlikely(is_loop))
break;
size_t array_element_offset = index.array_element_offset;
size_t previous_offset_size = offset > 0 ? offsets[offset - 1] : 0;
size_t start_index = previous_offset_size + array_element_offset;
size_t end_index = offsets[offset];
current_hierarchy_depth += end_index - start_index;
/// TODO: Insert part of pod array into itself
while (start_index < end_index)
{
elements.emplace_back(elements[start_index]);
++start_index;
}
break;
}
if (hierarchy_key == hierarchy_null_value || current_hierarchy_depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
break;
already_processes_keys_to_offset[hierarchy_key] = {offsets.size(), current_hierarchy_depth};
elements.emplace_back(hierarchy_key);
++current_hierarchy_depth;
std::optional<KeyType> parent_key = std::forward<GetParentKeyFunc>(get_key_func)(hierarchy_key);
if (!parent_key.has_value())
break;
hierarchy_key = *parent_key;
}
offsets.emplace_back(elements.size());
}
offsets.emplace_back(elements.size());
ElementsAndOffsets<KeyType> result = {std::move(elements), std::move(offsets)};
return result;
}
ElementsAndOffsets<KeyType> result = {std::move(elements), std::move(offsets)};
/// Strategy tag for getDescendants: every descendant, at any depth, is added to the result.
/// getDescendants selects its behaviour by the strategy type, so the value of level
/// has no effect on the result for this strategy.
struct GetAllDescendantsStrategy { size_t level = 0; };
/// Strategy tag for getDescendants: only keys found exactly level steps below
/// the requested key are added to the result.
struct GetDescendantsAtSpecificLevelStrategy { size_t level = 0; };
return result;
/** Collects descendants for every requested key by walking parent_to_child depth-first
  * with an explicit stack (no recursion).
  *
  * Result layout: elements holds the descendants of all requested keys one after another,
  * offsets[i] is the position in elements where the descendants of requested_keys[i] end.
  * A requested key that is absent from parent_to_child produces an empty range.
  *
  * Strategy selects what is emitted:
  *  - GetAllDescendantsStrategy: every descendant is emitted. The [start, end) range of descendants
  *    produced for each visited key is cached, so a key met again for a later request is copied
  *    from the already built part of the result instead of being traversed again.
  *  - GetDescendantsAtSpecificLevelStrategy: only keys at depth == strategy.level are emitted, no caching.
  *
  * If a key is met twice while processing a single requested key (a loop in the hierarchy),
  * processing of that requested key stops and whatever was collected so far is kept.
  */
template <typename KeyType, typename Strategy>
ElementsAndOffsets<KeyType> getDescendants(
    const PaddedPODArray<KeyType> & requested_keys,
    const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
    Strategy strategy)
{
    PaddedPODArray<KeyType> descendants;
    descendants.reserve(requested_keys.size());

    /// Same element type as ElementsAndOffsets::offsets, into which this array is moved at the end.
    PaddedPODArray<IColumn::Offset> descendants_offsets;
    descendants_offsets.reserve(requested_keys.size());

    struct Range
    {
        size_t start_index;
        size_t end_index;
    };

    /// Special depth value: a stack entry with this depth is not a key to traverse,
    /// it signals that all descendants of the key were emitted and its cached range can be closed.
    static constexpr Int64 key_range_requires_update = -1;
    HashMap<KeyType, Range> already_processed_keys_to_range [[maybe_unused]];

    if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
        already_processed_keys_to_range.reserve(requested_keys.size());

    struct KeyAndDepth
    {
        KeyType key;
        Int64 depth;
    };

    /// Keys expanded while processing the current requested key, used for loop detection.
    HashSet<KeyType> already_processed_keys_during_loop;
    already_processed_keys_during_loop.reserve(requested_keys.size());

    PaddedPODArray<KeyAndDepth> next_keys_to_process_stack;
    next_keys_to_process_stack.reserve(requested_keys.size());

    Int64 level = static_cast<Int64>(strategy.level);

    for (size_t i = 0; i < requested_keys.size(); ++i)
    {
        const KeyType & requested_key = requested_keys[i];

        /// Key has no children: empty array for this row.
        if (parent_to_child.find(requested_key) == nullptr)
        {
            descendants_offsets.emplace_back(descendants.size());
            continue;
        }

        next_keys_to_process_stack.emplace_back(KeyAndDepth{requested_key, 0});

        while (!next_keys_to_process_stack.empty())
        {
            KeyAndDepth key_to_process = next_keys_to_process_stack.back();

            KeyType key = key_to_process.key;
            Int64 depth = key_to_process.depth;
            next_keys_to_process_stack.pop_back();

            if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
            {
                /// Marker entry: everything below key was emitted, close its cached range.
                if (depth == key_range_requires_update)
                {
                    auto * it = already_processed_keys_to_range.find(key);
                    assert(it);

                    auto & range_to_update = it->getMapped();
                    range_to_update.end_index = descendants.size();
                    continue;
                }
            }

            /// Key was already expanded for this requested key: loop in hierarchy, stop processing it.
            if (unlikely(already_processed_keys_during_loop.find(key) != nullptr))
            {
                next_keys_to_process_stack.clear();
                break;
            }

            if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
            {
                const auto * already_processed_it = already_processed_keys_to_range.find(key);

                if (already_processed_it)
                {
                    Range range = already_processed_it->getMapped();

                    if (unlikely(range.start_index > range.end_index))
                    {
                        /// Broken range because there was a loop: drop it and traverse the key again.
                        already_processed_keys_to_range.erase(key);
                    }
                    else
                    {
                        /// Reuse descendants that were already emitted for this key.
                        /// The element is copied into a local before emplace_back: the array may grow
                        /// during the call, which would invalidate a reference into its own buffer.
                        for (size_t index = range.start_index; index != range.end_index; ++index)
                        {
                            const KeyType cached_descendant = descendants[index];
                            descendants.emplace_back(cached_descendant);
                        }

                        continue;
                    }
                }
            }

            const auto * it = parent_to_child.find(key);

            if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
                continue;

            if constexpr (std::is_same_v<Strategy, GetDescendantsAtSpecificLevelStrategy>)
            {
                /// Keys below the requested level are not expanded.
                if (depth > level)
                    continue;
            }

            if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
            {
                /// Open cached range for key and put marker on stack below its children,
                /// so the range is closed after all of them are processed.
                size_t range_start_index = descendants.size();
                already_processed_keys_to_range[key].start_index = range_start_index;
                next_keys_to_process_stack.emplace_back(KeyAndDepth{key, key_range_requires_update});
            }

            already_processed_keys_during_loop.insert(key);

            ++depth;

            const auto & children = it->getMapped();

            for (auto child_key : children)
            {
                /// All descendants strategy emits every child, specific level strategy only children at requested depth.
                if (std::is_same_v<Strategy, GetAllDescendantsStrategy> || depth == level)
                    descendants.emplace_back(child_key);

                next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth});
            }
        }

        already_processed_keys_during_loop.clear();

        descendants_offsets.emplace_back(descendants.size());
    }

    ElementsAndOffsets<KeyType> result = {std::move(descendants), std::move(descendants_offsets)};
    return result;
}
/// Wraps elements and offsets into ColumnArray. Both arrays are moved out of elements_and_offsets.
template <typename KeyType>
ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets<KeyType> && elements_and_offsets)
{
    auto nested_values = ColumnVector<KeyType>::create();
    nested_values->getData() = std::move(elements_and_offsets.elements);

    auto array_offsets = ColumnVector<IColumn::Offset>::create();
    array_offsets->getData() = std::move(elements_and_offsets.offsets);

    return ColumnArray::create(std::move(nested_values), std::move(array_offsets));
}
}
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
@ -137,16 +304,8 @@ ColumnPtr getKeysHierarchyArray(
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_func)
{
auto elements_and_offsets = getKeysHierarchy(hierarchy_keys, hierarchy_null_value, std::forward<IsKeyValidFunc>(is_key_valid_func), std::forward<GetParentKeyFunc>(get_parent_func));
auto elements_column = ColumnVector<KeyType>::create();
elements_column->getData() = std::move(elements_and_offsets.elements);
auto offsets_column = ColumnVector<IColumn::Offset>::create();
offsets_column->getData() = std::move(elements_and_offsets.offsets);
auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column));
return column_array;
auto elements_and_offsets = detail::getKeysHierarchy(hierarchy_keys, hierarchy_null_value, std::forward<IsKeyValidFunc>(is_key_valid_func), std::forward<GetParentKeyFunc>(get_parent_func));
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
@ -162,7 +321,7 @@ PaddedPODArray<UInt8> isInKeysHierarchy(
PaddedPODArray<UInt8> result;
result.resize_fill(hierarchy_keys.size());
ElementsAndOffsets<KeyType> hierarchy = getKeysHierarchy(
detail::ElementsAndOffsets<KeyType> hierarchy = detail::getKeysHierarchy(
hierarchy_keys,
hierarchy_null_value,
std::forward<IsKeyValidFunc>(is_key_valid_func),
@ -190,64 +349,41 @@ PaddedPODArray<UInt8> isInKeysHierarchy(
return result;
}
template <typename KeyType, typename IsKeyValidFunc, typename GetDescendantKeyFunc>
template <typename KeyType>
ColumnPtr getDescendantsArray(
const PaddedPODArray<KeyType> & hierarchy_keys,
const KeyType & hierarchy_null_value,
size_t level,
IsKeyValidFunc && is_key_valid_func,
GetDescendantKeyFunc && get_descendant_func)
const PaddedPODArray<KeyType> & requested_keys,
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
size_t level)
{
auto elements_and_offsets = getKeysHierarchy(
hierarchy_keys,
hierarchy_null_value,
std::forward<IsKeyValidFunc>(is_key_valid_func),
std::forward<GetDescendantKeyFunc>(get_descendant_func));
auto & elements = elements_and_offsets.elements;
auto & offsets = elements_and_offsets.offsets;
std::cerr << "getDescendantsArray" << std::endl;
std::cerr << "Elements " << elements.size() << std::endl;
for (auto element : elements)
std::cerr << element << " ";
std::cerr << std::endl;
std::cerr << "Offsets " << offsets.size() << std::endl;
for (auto offset : offsets)
std::cerr << offset << " ";
std::cerr << std::endl;
PaddedPODArray<KeyType> descendants;
descendants.reserve(elements.size());
PaddedPODArray<size_t> descendants_offsets;
descendants_offsets.reserve(elements.size());
for (size_t i = 0; i < offsets.size(); ++i)
for (auto & node : parent_to_child)
{
size_t offset_start_index = i > 0 ? offsets[i - 1] : 0;
size_t offset_end_index = offsets[i];
size_t size = offset_end_index - offset_start_index;
if (level == 0)
descendants.insert(elements.begin() + offset_start_index + 1, elements.begin() + offset_end_index);
else if (level < size)
descendants.emplace_back(elements[offset_start_index + level]);
descendants_offsets.emplace_back(descendants.size());
const auto & key = node.getKey();
const auto & childs = node.getMapped();
std::cerr << "Key " << key << " childs " << childs.size() << std::endl;
for (auto child : childs)
std::cerr << child << " ";
std::cerr << std::endl;
}
auto elements_column = ColumnVector<KeyType>::create();
elements_column->getData() = std::move(descendants);
auto offsets_column = ColumnVector<IColumn::Offset>::create();
offsets_column->getData() = std::move(descendants_offsets);
auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column));
return column_array;
if (level == 0)
{
detail::GetAllDescendantsStrategy strategy { .level = level };
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy);
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
else
{
detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level };
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy);
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
}
ColumnPtr getHierarchyDefaultImplementation(const IDictionary * dictionary, ColumnPtr key_column, const DataTypePtr & key_type);
ColumnPtr getHierarchyDefaultImplementation(
const IDictionary * dictionary,
ColumnPtr key_column,
const DataTypePtr & key_type);
ColumnUInt8::Ptr isInHierarchyDefaultImplementation(
const IDictionary * dictionary,

View File

@ -852,7 +852,7 @@ private:
if (!dict->hasHierarchy())
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Dictionary ({}) does not support hierarchy", dict->getFullName());
ColumnPtr res = dict->getDescendants(arguments[1].column, std::make_shared<DataTypeUInt64>(), 0);
ColumnPtr res = dict->getDescendants(arguments[1].column, std::make_shared<DataTypeUInt64>(), 1);
return res;
}
@ -895,7 +895,7 @@ private:
throw Exception{"Illegal type " + arguments[1]->getName() + " of third argument of function " + getName()
+ ", must be const unsigned integer.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
return std::make_shared<DataTypeUInt8>();
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
}
bool isDeterministic() const override { return false; }