From c4440ad82955d2badb54b4ed1c5050fcc8b4ac7a Mon Sep 17 00:00:00 2001 From: Andrey Mironov Date: Mon, 13 Jul 2015 19:18:28 +0300 Subject: [PATCH] dbms: RangeHashedDictionary implementation [#METR-17111] --- .../Dictionaries/ClickHouseDictionarySource.h | 8 + .../DB/Dictionaries/DictionaryStructure.h | 6 +- .../DB/Dictionaries/MySQLDictionarySource.h | 8 + .../DB/Dictionaries/RangeHashedDictionary.h | 434 ++++++++++++++++++ .../DB/Functions/FunctionsDictionaries.h | 17 +- dbms/src/Interpreters/DictionaryFactory.cpp | 53 ++- 6 files changed, 504 insertions(+), 22 deletions(-) create mode 100644 dbms/include/DB/Dictionaries/RangeHashedDictionary.h diff --git a/dbms/include/DB/Dictionaries/ClickHouseDictionarySource.h b/dbms/include/DB/Dictionaries/ClickHouseDictionarySource.h index e57248de428..37daae6a018 100644 --- a/dbms/include/DB/Dictionaries/ClickHouseDictionarySource.h +++ b/dbms/include/DB/Dictionaries/ClickHouseDictionarySource.h @@ -95,6 +95,14 @@ private: writeProbablyBackQuotedString(dict_struct.id_name, out); + if (!dict_struct.range_min.empty() && !dict_struct.range_max.empty()) /* when both range column names are configured, list them right after the id column and before the attributes */ + { + writeString(", ", out); + writeProbablyBackQuotedString(dict_struct.range_min, out); + writeString(", ", out); + writeProbablyBackQuotedString(dict_struct.range_max, out); + } + for (const auto & attr : dict_struct.attributes) { writeString(", ", out); diff --git a/dbms/include/DB/Dictionaries/DictionaryStructure.h b/dbms/include/DB/Dictionaries/DictionaryStructure.h index ca35fe933ba..1723043a005 100644 --- a/dbms/include/DB/Dictionaries/DictionaryStructure.h +++ b/dbms/include/DB/Dictionaries/DictionaryStructure.h @@ -117,9 +117,13 @@ struct DictionaryStructure final { std::string id_name; std::vector attributes; + std::string range_min; + std::string range_max; /* column names of the two range bounds; filled by the constructor initializers from the configuration */ DictionaryStructure(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) - : id_name{config.getString(config_prefix + 
".id.name")}, + range_min{config.getString(config_prefix + ".range_min.name", "")}, + range_max{config.getString(config_prefix + ".range_max.name", "")} /* read from .range_min.name and .range_max.name under config_prefix, passing an empty string as the fallback argument */ { if (id_name.empty()) throw Exception{ diff --git a/dbms/include/DB/Dictionaries/MySQLDictionarySource.h b/dbms/include/DB/Dictionaries/MySQLDictionarySource.h index e5f83232e13..c0ee3974d49 100644 --- a/dbms/include/DB/Dictionaries/MySQLDictionarySource.h +++ b/dbms/include/DB/Dictionaries/MySQLDictionarySource.h @@ -106,6 +106,14 @@ private: writeProbablyBackQuotedString(dict_struct.id_name, out); + if (!dict_struct.range_min.empty() && !dict_struct.range_max.empty()) /* when both range column names are configured, list them right after the id column and before the attributes */ + { + writeString(", ", out); + writeProbablyBackQuotedString(dict_struct.range_min, out); + writeString(", ", out); + writeProbablyBackQuotedString(dict_struct.range_max, out); + } + for (const auto & attr : dict_struct.attributes) { writeString(", ", out); diff --git a/dbms/include/DB/Dictionaries/RangeHashedDictionary.h b/dbms/include/DB/Dictionaries/RangeHashedDictionary.h new file mode 100644 index 00000000000..cc40cd0fb59 --- /dev/null +++ b/dbms/include/DB/Dictionaries/RangeHashedDictionary.h @@ -0,0 +1,434 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +class RangeHashedDictionary final : public IDictionary /* Dictionary that stores, per id, a list of (range, value) entries for every attribute; a lookup returns the value of the first stored range that contains the lookup date, or the attribute null value. */ +{ +public: + RangeHashedDictionary(const std::string & name, const DictionaryStructure & dict_struct, + DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime) + : name{name}, dict_struct(dict_struct), + source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime) + { + createAttributes(); /* called outside the try block, so an exception thrown here propagates to the caller */ + + try + { + loadData(); + calculateBytesAllocated(); + } + catch (...) 
+ { + creation_exception = std::current_exception(); /* failures of loadData or calculateBytesAllocated are stored, not rethrown; they are exposed through getCreationException */ + } + + creation_time = std::chrono::system_clock::now(); + } + + RangeHashedDictionary(const RangeHashedDictionary & other) + : RangeHashedDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime} + {} /* delegates to the main constructor with a cloned source, so the copy runs createAttributes and loadData again */ + + std::exception_ptr getCreationException() const override { return creation_exception; } + + std::string getName() const override { return name; } + + std::string getTypeName() const override { return "RangeHashed"; } + + std::size_t getBytesAllocated() const override { return bytes_allocated; } + + std::size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + + double getHitRate() const override { return 1.0; } + + std::size_t getElementCount() const override { return element_count; } + + double getLoadFactor() const override { return static_cast(element_count) / bucket_count; } /* NOTE(review): bucket_count starts at 0 and is only assigned in addAttributeSize; if loading failed this divides by zero - confirm that is acceptable */ + + bool isCached() const override { return false; } + + DictionaryPtr clone() const override { return std::make_unique(*this); } + + const IDictionarySource * getSource() const override { return source_ptr.get(); } + + const DictionaryLifetime & getLifetime() const override { return dict_lifetime; } + + const DictionaryStructure & getStructure() const override { return dict_struct; } + + std::chrono::time_point getCreationTime() const override + { + return creation_time; + } + + bool isInjective(const std::string & attribute_name) const override + { + return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; /* the pointer difference against attributes.data() is the index of the attribute, reused to index dict_struct.attributes */ + } + + bool hasHierarchy() const override { return hierarchical_attribute; } + + void toParent(const PODArray & ids, PODArray & out) const override + { + getItems(*hierarchical_attribute, ids, out); /* NOTE(review): hierarchical_attribute is nullptr unless some attribute is flagged hierarchical - presumably callers check hasHierarchy first, confirm */ + } + +#define DECLARE_MULTIPLE_GETTER(TYPE)\ + void get##TYPE(const std::string & attribute_name, const PODArray & ids, PODArray & out) const override\ + {\ + const auto & attribute = getAttribute(attribute_name);\ + if 
(attribute.type != AttributeUnderlyingType::TYPE)\ + throw Exception{\ + "Type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\ + ErrorCodes::TYPE_MISMATCH\ + };\ + \ + getItems(attribute, ids, out);\ + } + DECLARE_MULTIPLE_GETTER(UInt8) + DECLARE_MULTIPLE_GETTER(UInt16) + DECLARE_MULTIPLE_GETTER(UInt32) + DECLARE_MULTIPLE_GETTER(UInt64) + DECLARE_MULTIPLE_GETTER(Int8) + DECLARE_MULTIPLE_GETTER(Int16) + DECLARE_MULTIPLE_GETTER(Int32) + DECLARE_MULTIPLE_GETTER(Int64) + DECLARE_MULTIPLE_GETTER(Float32) + DECLARE_MULTIPLE_GETTER(Float64) +#undef DECLARE_MULTIPLE_GETTER + void getString(const std::string & attribute_name, const PODArray & ids, ColumnString * out) const override + { + const auto & attribute = getAttribute(attribute_name); + if (attribute.type != AttributeUnderlyingType::String) + throw Exception{ + "Type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type), + ErrorCodes::TYPE_MISMATCH + }; + + const auto & attr = *std::get>(attribute.maps); + const auto & null_value = std::get(attribute.null_values); + + const UInt16 date = 10; /* NOTE(review): the date used for range matching is a hard-coded constant and this signature carries no date argument - looks like a placeholder, confirm */ + + for (const auto i : ext::range(0, ids.size())) + { + const auto it = attr.find(ids[i]); + if (it != std::end(attr)) + { + const auto & ranges_and_values = it->second; + const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values), + [date] (const value_t & v) { return v.range.contains(date); }); /* first stored range containing date, or the end iterator if none matches */ + + const auto string_ref = val_it != std::end(ranges_and_values) ? 
val_it->value : null_value; + out->insertData(string_ref.data, string_ref.size); + } + else + out->insertData(null_value.data(), null_value.size()); + } + + query_count.fetch_add(ids.size(), std::memory_order_relaxed); + } + +private: + struct range_t : std::pair + { + using std::pair::pair; + + bool contains(const UInt16 date) const /* true if date lies within [first, second]; a bound that is 0 or greater than DATE_LUT_MAX_DAY_NUM is treated as absent, leaving that side unbounded */ + { + const auto & left = first; + const auto & right = second; + + if (left <= date && date <= right) + return true; + + const auto has_left_bound = 0 < left && left <= DATE_LUT_MAX_DAY_NUM; + const auto has_right_bound = 0 < right && right <= DATE_LUT_MAX_DAY_NUM; + + if ((!has_left_bound || left <= date) && (!has_right_bound || date <= right)) + return true; + + return false; + } + }; + + template + struct value_t final + { + range_t range; + T value; + }; + + template using values_t = std::vector>; + template using collection_t = HashMap>; + template using ptr_t = std::unique_ptr>; + + struct attribute_t final + { + public: + AttributeUnderlyingType type; + std::tuple null_values; + std::tuple, ptr_t, ptr_t, ptr_t, + ptr_t, ptr_t, ptr_t, ptr_t, + ptr_t, ptr_t, ptr_t> maps; + std::unique_ptr string_arena; + }; + + void createAttributes() + { + const auto size = dict_struct.attributes.size(); + attributes.reserve(size); /* capacity for every attribute is reserved up front, so the push_back calls below do not reallocate and the pointer kept in hierarchical_attribute stays valid */ + + for (const auto & attribute : dict_struct.attributes) + { + attribute_index_by_name.emplace(attribute.name, attributes.size()); + attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value)); + + if (attribute.hierarchical) + { + hierarchical_attribute = &attributes.back(); + + if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64) + throw Exception{ + "Hierarchical attribute must be UInt64.", + ErrorCodes::TYPE_MISMATCH + }; + } + } + } + + void loadData() /* reads every block from the source; columns are taken by position: 0 is the id, 1 and 2 are the range bounds, attribute columns start at 3 */ + { + auto stream = source_ptr->loadAll(); + stream->readPrefix(); + + while (const auto block = stream->read()) + { + const auto & id_column = *block.getByPosition(0).column; + const auto & min_range_column = 
*block.getByPosition(1).column; + const auto & max_range_column = *block.getByPosition(2).column; + + element_count += id_column.size(); /* counts rows of the block, one per (id, range) row read */ + + for (const auto attribute_idx : ext::range(0, attributes.size())) + { + const auto & attribute_column = *block.getByPosition(attribute_idx + 3).column; + auto & attribute = attributes[attribute_idx]; + + for (const auto row_idx : ext::range(0, id_column.size())) + setAttributeValue(attribute, id_column[row_idx].get(), + range_t(min_range_column[row_idx].get(), max_range_column[row_idx].get()), + attribute_column[row_idx]); + } + } + + stream->readSuffix(); + } + + template + void addAttributeSize(const attribute_t & attribute) + { + const auto & map_ref = std::get>(attribute.maps); + bytes_allocated += sizeof(collection_t) + map_ref->getBufferSizeInBytes(); + bucket_count = map_ref->getBufferSizeInCells(); /* assigned, not accumulated: the value from the last attribute processed remains */ + } + + void calculateBytesAllocated() /* NOTE(review): heap storage owned by the per-id value vectors is not added anywhere in this function - confirm whether that is intended */ + { + bytes_allocated += attributes.size() * sizeof(attributes.front()); + + for (const auto & attribute : attributes) + { + switch (attribute.type) + { + case AttributeUnderlyingType::UInt8: addAttributeSize(attribute); break; + case AttributeUnderlyingType::UInt16: addAttributeSize(attribute); break; + case AttributeUnderlyingType::UInt32: addAttributeSize(attribute); break; + case AttributeUnderlyingType::UInt64: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int8: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int16: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int32: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Int64: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Float32: addAttributeSize(attribute); break; + case AttributeUnderlyingType::Float64: addAttributeSize(attribute); break; + case AttributeUnderlyingType::String: + { + addAttributeSize(attribute); + bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); + + break; + } + } + } + } + + template + void 
createAttributeImpl(attribute_t & attribute, const Field & null_value) + { + std::get(attribute.null_values) = null_value.get::Type>(); + std::get>(attribute.maps) = std::make_unique>(); + } + + attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) + { + attribute_t attr{type}; + + switch (type) + { + case AttributeUnderlyingType::UInt8: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::UInt16: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::UInt32: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::UInt64: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int8: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int16: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int32: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Int64: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Float32: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::Float64: createAttributeImpl(attr, null_value); break; + case AttributeUnderlyingType::String: + { + const auto & null_value_ref = std::get(attr.null_values) = null_value.get(); /* NOTE(review): null_value_ref is not referenced again in this case block */ + std::get>(attr.maps) = std::make_unique>(); + attr.string_arena = std::make_unique(); + break; + } + } + + return attr; + } + + template + void getItems(const attribute_t & attribute, const PODArray & ids, PODArray & out) const /* for each id writes into out[i] the value of the first range containing date, or the attribute null value; out is indexed rather than appended to - assumes the caller sized it to ids.size(), TODO confirm */ + { + const auto & attr = *std::get>(attribute.maps); + const auto null_value = std::get(attribute.null_values); + + const UInt16 date = 10; /* NOTE(review): hard-coded lookup date, same constant as in getString - looks like a placeholder, confirm */ + + for (const auto i : ext::range(0, ids.size())) + { + const auto it = attr.find(ids[i]); + if (it != std::end(attr)) + { + const auto & ranges_and_values = it->second; + const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values), + [date] (const value_t & v) { return 
v.range.contains(date); }); + + out[i] = val_it != std::end(ranges_and_values) ? val_it->value : null_value; + } + else + out[i] = null_value; + } + + query_count.fetch_add(ids.size(), std::memory_order_relaxed); + } + + template + void setAttributeValueImpl(attribute_t & attribute, const id_t id, const range_t & range, const T value) + { + auto & map = *std::get>(attribute.maps); + const auto it = map.find(id); + + if (it != map.end()) + { + auto & values = it->second; + + const auto insert_it = std::lower_bound(std::begin(values), std::end(values), range, + [] (const value_t & lhs, const range_t & range) { + return lhs.range < range; + }); /* the insertion position is found by comparing ranges with operator less-than, which keeps each per-id vector ordered by range */ + + values.insert(insert_it, value_t{ range, value }); + } + else + map.insert({ id, values_t{ value_t{ range, value } } }); + } + + void setAttributeValue(attribute_t & attribute, const id_t id, const range_t & range, const Field & value) + { + switch (attribute.type) + { + case AttributeUnderlyingType::UInt8: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::UInt16: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::UInt32: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::UInt64: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::Int8: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::Int16: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::Int32: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::Int64: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::Float32: setAttributeValueImpl(attribute, id, range, value.get()); break; + case AttributeUnderlyingType::Float64: setAttributeValueImpl(attribute, id, range, value.get()); break; + case 
AttributeUnderlyingType::String: + { + auto & map = *std::get>(attribute.maps); + const auto & string = value.get(); + const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size()); + const StringRef string_ref{string_in_arena, string.size()}; /* the string is inserted into the attribute arena and the stored StringRef refers to the pointer the arena returned */ + + const auto it = map.find(id); + + if (it != map.end()) + { + auto & values = it->second; + + const auto insert_it = std::lower_bound(std::begin(values), std::end(values), range, + [] (const value_t & lhs, const range_t & range) { + return lhs.range < range; + }); + + values.insert(insert_it, value_t{ range, string_ref }); + } + else + map.insert({ id, values_t{ value_t{ range, string_ref } } }); + + break; + } + } + } + + const attribute_t & getAttribute(const std::string & attribute_name) const + { + const auto it = attribute_index_by_name.find(attribute_name); + if (it == std::end(attribute_index_by_name)) + throw Exception{ + "No such attribute '" + attribute_name + "'", + ErrorCodes::BAD_ARGUMENTS + }; + + return attributes[it->second]; + } + + const std::string name; + const DictionaryStructure dict_struct; + const DictionarySourcePtr source_ptr; + const DictionaryLifetime dict_lifetime; + + std::map attribute_index_by_name; + std::vector attributes; + const attribute_t * hierarchical_attribute = nullptr; + + std::size_t bytes_allocated = 0; + std::size_t element_count = 0; + std::size_t bucket_count = 0; + mutable std::atomic query_count{}; + + std::chrono::time_point creation_time; + + std::exception_ptr creation_exception; +}; + +} diff --git a/dbms/include/DB/Functions/FunctionsDictionaries.h b/dbms/include/DB/Functions/FunctionsDictionaries.h index d7361e23d54..1f2df888b2a 100644 --- a/dbms/include/DB/Functions/FunctionsDictionaries.h +++ b/dbms/include/DB/Functions/FunctionsDictionaries.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -806,7 +807,8 @@ private: if (!executeDispatch(block, arguments, result, dict_ptr) && !executeDispatch(block, arguments, result, 
dict_ptr) && - !executeDispatch(block, arguments, result, dict_ptr)) + !executeDispatch(block, arguments, result, dict_ptr) && + !executeDispatch(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE @@ -839,7 +841,7 @@ private: } else if (const auto id_col = typeid_cast *>(id_col_untyped)) { - const PODArray ids{1, id_col->getData()}; + const PODArray ids(1, id_col->getData()); /* NOTE(review): parentheses instead of braces - presumably to select a count and value constructor rather than list-initialization; confirm against PODArray */ auto out = std::make_unique(); dictionary->getString(attr_name, ids, out.get()); @@ -956,7 +958,8 @@ private: if (!executeDispatch(block, arguments, result, dict_ptr) && !executeDispatch(block, arguments, result, dict_ptr) && - !executeDispatch(block, arguments, result, dict_ptr)) + !executeDispatch(block, arguments, result, dict_ptr) && + !executeDispatch(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE @@ -995,7 +998,7 @@ private: } else if (const auto id_col = typeid_cast *>(id_col_untyped)) { - const PODArray ids{1, id_col->getData()}; + const PODArray ids(1, id_col->getData()); PODArray data(1); DictGetTraits::get(dictionary, attr_name, ids, data); @@ -1098,7 +1101,8 @@ private: if (!executeDispatch(block, arguments, result, dict_ptr) && !executeDispatch(block, arguments, result, dict_ptr) && - !executeDispatch(block, arguments, result, dict_ptr)) + !executeDispatch(block, arguments, result, dict_ptr) && + !executeDispatch(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE @@ -1287,7 +1291,8 @@ private: if (!executeDispatch(block, arguments, result, dict_ptr) && !executeDispatch(block, arguments, result, dict_ptr) && - !executeDispatch(block, arguments, result, dict_ptr)) + !executeDispatch(block, arguments, result, dict_ptr) && + !executeDispatch(block, arguments, result, dict_ptr)) throw Exception{ "Unsupported dictionary type " + 
dict_ptr->getTypeName(), ErrorCodes::UNKNOWN_TYPE diff --git a/dbms/src/Interpreters/DictionaryFactory.cpp b/dbms/src/Interpreters/DictionaryFactory.cpp index 38d973da08f..23a434c6eef 100644 --- a/dbms/src/Interpreters/DictionaryFactory.cpp +++ b/dbms/src/Interpreters/DictionaryFactory.cpp @@ -3,13 +3,15 @@ #include #include #include +#include #include #include -#include + namespace DB { + DictionaryPtr DictionaryFactory::create(const std::string & name, Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Context & context) const { @@ -31,24 +33,44 @@ DictionaryPtr DictionaryFactory::create(const std::string & name, Poco::Util::Ab const auto & layout_type = keys.front(); - if ("flat" == layout_type) + if ("range_hashed" == layout_type) /* the range_hashed layout is dispatched first because it is the only layout that uses the range bound columns */ { - return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime); - } - else if ("hashed" == layout_type) - { - return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime); - } - else if ("cache" == layout_type) - { - const auto size = config.getInt(layout_prefix + ".cache.size_in_cells"); - if (size == 0) + if (dict_struct.range_min.empty() || dict_struct.range_max.empty()) /* both bounds are required: reject the configuration if either range_min or range_max is missing (the second operand must test range_max, not range_min again) */ throw Exception{ - "Dictionary of type 'cache' cannot have 0 cells", - ErrorCodes::TOO_SMALL_BUFFER_SIZE + "Dictionary of layout 'range_hashed' requires .structure.range_min and .structure.range_max", + ErrorCodes::BAD_ARGUMENTS }; - return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime); + return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime); + } + else + { + if (!dict_struct.range_min.empty() || !dict_struct.range_max.empty()) /* every other layout must configure neither bound: reject if either range_min or range_max is present */ + throw Exception{ + "Elements .structure.range_min and .structure.range_max should be defined only " + "for a dictionary of layout 'range_hashed'", + ErrorCodes::BAD_ARGUMENTS + }; + + if ("flat" == layout_type) + { + return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime); + } + else if 
("hashed" == layout_type) + { + return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime); + } + else if ("cache" == layout_type) + { + const auto size = config.getInt(layout_prefix + ".cache.size_in_cells"); + if (size == 0) /* a cache with zero cells is rejected before the dictionary is constructed */ + throw Exception{ + "Dictionary of layout 'cache' cannot have 0 cells", + ErrorCodes::TOO_SMALL_BUFFER_SIZE + }; + + return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, size); + } } throw Exception{ @@ -57,4 +79,5 @@ DictionaryPtr DictionaryFactory::create(const std::string & name, Poco::Util::Ab }; }; + }