Merge branch 'METR-21665'

This commit is contained in:
Pavel Kartavyy 2016-06-08 17:49:33 +03:00
commit f992afa8bb
38 changed files with 4341 additions and 3182 deletions

View File

@ -124,3 +124,4 @@ add_subdirectory (contrib)
add_subdirectory (libs) add_subdirectory (libs)
add_subdirectory (utils) add_subdirectory (utils)
add_subdirectory (dbms) add_subdirectory (dbms)
add_subdirectory (private)

View File

@ -163,7 +163,7 @@ public:
protected: protected:
int run(); int run();
void waitForTerminationRequest(); virtual void waitForTerminationRequest();
#if !defined(_WIN32_WCE) #if !defined(_WIN32_WCE)
void defineOptions(OptionSet& options); void defineOptions(OptionSet& options);
#endif #endif

View File

@ -756,6 +756,13 @@ add_library (dbms
src/Dictionaries/MySQLDictionarySource.cpp src/Dictionaries/MySQLDictionarySource.cpp
src/Dictionaries/ODBCDictionarySource.cpp src/Dictionaries/ODBCDictionarySource.cpp
src/Dictionaries/writeParenthesisedString.cpp src/Dictionaries/writeParenthesisedString.cpp
src/Dictionaries/DictionaryStructure.cpp
src/Dictionaries/FlatDictionary.cpp
src/Dictionaries/HashedDictionary.cpp
src/Dictionaries/CacheDictionary.cpp
src/Dictionaries/RangeHashedDictionary.cpp
src/Dictionaries/ComplexKeyHashedDictionary.cpp
src/Dictionaries/ComplexKeyCacheDictionary.cpp
src/Parsers/ASTSelectQuery.cpp src/Parsers/ASTSelectQuery.cpp
src/Parsers/ASTAlterQuery.cpp src/Parsers/ASTAlterQuery.cpp
@ -921,6 +928,12 @@ SET_SOURCE_FILES_PROPERTIES(
src/Functions/FunctionsMath.cpp src/Functions/FunctionsMath.cpp
src/Functions/FunctionsMiscellaneous.cpp src/Functions/FunctionsMiscellaneous.cpp
src/Functions/FunctionsTransform.cpp src/Functions/FunctionsTransform.cpp
src/Dictionaries/FlatDictionary.cpp
src/Dictionaries/HashedDictionary.cpp
src/Dictionaries/CacheDictionary.cpp
src/Dictionaries/RangeHashedDictionary.cpp
src/Dictionaries/ComplexKeyHashedDictionary.cpp
src/Dictionaries/ComplexKeyCacheDictionary.cpp
PROPERTIES COMPILE_FLAGS -g0) PROPERTIES COMPILE_FLAGS -g0)
IF (NOT AARCH64) IF (NOT AARCH64)

View File

@ -24,37 +24,14 @@
namespace DB namespace DB
{ {
/// Error codes used by the exceptions thrown from the code in this header.
/// These are `extern` declarations only; the constants are defined elsewhere.
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int BAD_ARGUMENTS;
extern const int UNSUPPORTED_METHOD;
}
class CacheDictionary final : public IDictionary class CacheDictionary final : public IDictionary
{ {
public: public:
CacheDictionary(const std::string & name, const DictionaryStructure & dict_struct, CacheDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime,
const std::size_t size) const std::size_t size);
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
size{round_up_to_power_of_two(size)},
cells{this->size}
{
if (!this->source_ptr->supportsSelectiveLoad())
throw Exception{
name + ": source cannot be used with CacheDictionary",
ErrorCodes::UNSUPPORTED_METHOD
};
createAttributes(); CacheDictionary(const CacheDictionary & other);
}
CacheDictionary(const CacheDictionary & other)
: CacheDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.size}
{}
std::exception_ptr getCreationException() const override { return {}; } std::exception_ptr getCreationException() const override { return {}; }
@ -101,27 +78,10 @@ public:
bool hasHierarchy() const override { return hierarchical_attribute; } bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const override void toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const override;
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
getItems<UInt64>(*hierarchical_attribute, ids, out, [&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const\ void get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const;
{\
auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItems<TYPE>(attribute, ids, out, [&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -133,34 +93,13 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const
{
auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
const auto null_value = StringRef{std::get<String>(attribute.null_values)}; void getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const;
getItems(attribute, ids, out, [&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, ids, out, [&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -172,33 +111,14 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def,
ColumnString * const out) const ColumnString * const out) const;
{
auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems(attribute, ids, out, [&] (const std::size_t row) { return def->getDataAt(row); });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE def, PaddedPODArray<TYPE> & out) const\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE def, PaddedPODArray<TYPE> & out) const;
{\
auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, ids, out, [&] (const std::size_t) { return def; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -210,67 +130,12 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def,
ColumnString * const out) const ColumnString * const out) const;
{
auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems(attribute, ids, out, [&] (const std::size_t) { return StringRef{def}; }); void has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const override;
}
void has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const override
{
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
MapType<std::vector<std::size_t>> outdated_ids;
const auto rows = ext::size(ids);
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
/** cell should be updated if either:
* 1. ids do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.id != id || cell.expiresAt() < now)
outdated_ids[id].push_back(row);
else
out[row] = !cell.isDefault();
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release);
if (outdated_ids.empty())
return;
std::vector<id_t> required_ids(outdated_ids.size());
std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids),
[] (auto & pair) { return pair.first; });
/// request new values
update(required_ids, [&] (const auto id, const auto) {
for (const auto row : outdated_ids[id])
out[row] = true;
}, [&] (const auto id, const auto) {
for (const auto row : outdated_ids[id])
out[row] = false;
});
}
private: private:
template <typename Value> using MapType = HashMap<id_t, Value>; template <typename Value> using MapType = HashMap<id_t, Value>;
@ -313,475 +178,44 @@ private:
ContainerPtrType<StringRef>> arrays; ContainerPtrType<StringRef>> arrays;
}; };
void createAttributes() void createAttributes();
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
bytes_allocated += size * sizeof(cell_metadata_t); attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
bytes_allocated += size * sizeof(attributes.front());
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical) template <typename OutputType, typename DefaultGetter>
{ void getItemsNumber(
hierarchical_attribute = &attributes.back(); attribute_t & attribute,
const PaddedPODArray<id_t> & ids,
PaddedPODArray<OutputType> & out,
DefaultGetter && get_default) const;
if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64) template <typename AttributeType, typename OutputType, typename DefaultGetter>
throw Exception{ void getItemsNumberImpl(
name + ": hierarchical attribute must be UInt64.", attribute_t & attribute,
ErrorCodes::TYPE_MISMATCH const PaddedPODArray<id_t> & ids,
}; PaddedPODArray<OutputType> & out,
} DefaultGetter && get_default) const;
}
}
/** Builds the storage for one attribute of the given underlying type:
  * stores `null_value` converted to that type, allocates an array of `size` elements
  * and adds the array size to `bytes_allocated`.
  * For String attributes the array holds StringRef and `string_arena` is created on first use.
  */
attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t attr{type};

    /// Numeric cases differ only in the stored type and in the Field type the null value is read as.
#define INIT_NUMERIC_ATTRIBUTE(TYPE, FIELD_TYPE)\
        case AttributeUnderlyingType::TYPE:\
            std::get<TYPE>(attr.null_values) = null_value.get<FIELD_TYPE>();\
            std::get<ContainerPtrType<TYPE>>(attr.arrays) = std::make_unique<ContainerType<TYPE>>(size);\
            bytes_allocated += size * sizeof(TYPE);\
            break;

    switch (type)
    {
        INIT_NUMERIC_ATTRIBUTE(UInt8, UInt64)
        INIT_NUMERIC_ATTRIBUTE(UInt16, UInt64)
        INIT_NUMERIC_ATTRIBUTE(UInt32, UInt64)
        INIT_NUMERIC_ATTRIBUTE(UInt64, UInt64)
        INIT_NUMERIC_ATTRIBUTE(Int8, Int64)
        INIT_NUMERIC_ATTRIBUTE(Int16, Int64)
        INIT_NUMERIC_ATTRIBUTE(Int32, Int64)
        INIT_NUMERIC_ATTRIBUTE(Int64, Int64)
        INIT_NUMERIC_ATTRIBUTE(Float32, Float64)
        INIT_NUMERIC_ATTRIBUTE(Float64, Float64)

        case AttributeUnderlyingType::String:
            std::get<String>(attr.null_values) = null_value.get<String>();
            std::get<ContainerPtrType<StringRef>>(attr.arrays) = std::make_unique<ContainerType<StringRef>>(size);
            bytes_allocated += size * sizeof(StringRef);

            /// The arena is shared by all string attributes, so create it only once.
            if (!string_arena)
                string_arena = std::make_unique<ArenaWithFreeLists>();
            break;
    }

#undef INIT_NUMERIC_ATTRIBUTE

    return attr;
}
/** Fills `out[row]` with the value of `attribute` for `ids[row]`.
  * Rows answered by a matching, unexpired cell are filled under a read lock
  * (`get_default(row)` is used when the cell is marked as default).
  * The remaining ids are requested through `update`: found ids receive the stored value,
  * ids that were not found receive `get_default(row)`.
  */
template <typename T, typename DefaultGetter>
void getItems(
    attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<T> & out, DefaultGetter && get_default) const
{
    auto & values = std::get<ContainerPtrType<T>>(attribute.arrays);
    const auto rows = ext::size(ids);

    /// Id that cannot be answered from the cache -> every row of `ids` asking for it.
    MapType<std::vector<std::size_t>> rows_by_stale_id;

    {
        const Poco::ScopedReadRWLock read_lock{rw_lock};
        const auto now = std::chrono::system_clock::now();

        for (const auto row : ext::range(0, rows))
        {
            const auto id = ids[row];
            const auto cell_idx = getCellIdx(id);
            const auto & cell = cells[cell_idx];

            /// Usable only if the cell holds this very id and has not expired yet.
            if (cell.id == id && !(cell.expiresAt() < now))
                out[row] = cell.isDefault() ? get_default(row) : values[cell_idx];
            else
                rows_by_stale_id[id].push_back(row);
        }
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
    hit_count.fetch_add(rows - rows_by_stale_id.size(), std::memory_order_release);

    if (rows_by_stale_id.empty())
        return;

    /// Collect the distinct ids that have to be requested.
    std::vector<id_t> required_ids(rows_by_stale_id.size());
    std::transform(std::begin(rows_by_stale_id), std::end(rows_by_stale_id), std::begin(required_ids),
        [] (auto & id_and_rows) { return id_and_rows.first; });

    update(required_ids,
        [&] (const auto id, const auto cell_idx)
        {
            /// Read the stored value once, then copy it to every row that asked for this id.
            const auto fetched_value = values[cell_idx];
            for (const auto row : rows_by_stale_id[id])
                out[row] = fetched_value;
        },
        [&] (const auto id, const auto)
        {
            for (const auto row : rows_by_stale_id[id])
                out[row] = get_default(row);
        });
}
template <typename DefaultGetter> template <typename DefaultGetter>
void getItems( void getItemsString(
attribute_t & attribute, const PaddedPODArray<id_t> & ids, ColumnString * out, DefaultGetter && get_default) const attribute_t & attribute,
{ const PaddedPODArray<id_t> & ids,
const auto rows = ext::size(ids); ColumnString * out,
DefaultGetter && get_default) const;
/// save on some allocations
out->getOffsets().reserve(rows);
auto & attribute_array = std::get<ContainerPtrType<StringRef>>(attribute.arrays);
auto found_outdated_values = false;
/// perform optimistic version, fallback to pessimistic if failed
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, discard on fail
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
if (cell.id != id || cell.expiresAt() < now)
{
found_outdated_values = true;
break;
}
else
{
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
}
}
/// optimistic code completed successfully
if (!found_outdated_values)
{
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows, std::memory_order_release);
return;
}
/// now onto the pessimistic one, discard possible partial results from the optimistic path
out->getChars().resize_assume_reserved(0);
out->getOffsets().resize_assume_reserved(0);
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
MapType<std::vector<std::size_t>> outdated_ids;
/// we are going to store every string separately
MapType<String> map;
std::size_t total_length = 0;
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
for (const auto row : ext::range(0, ids.size()))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
if (cell.id != id || cell.expiresAt() < now)
outdated_ids[id].push_back(row);
else
{
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
if (!cell.isDefault())
map[id] = String{string_ref};
total_length += string_ref.size + 1;
}
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release);
/// request new values
if (!outdated_ids.empty())
{
std::vector<id_t> required_ids(outdated_ids.size());
std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids),
[] (auto & pair) { return pair.first; });
update(required_ids, [&] (const auto id, const auto cell_idx) {
const auto attribute_value = attribute_array[cell_idx];
map[id] = String{attribute_value};
total_length += (attribute_value.size + 1) * outdated_ids[id].size();
}, [&] (const auto id, const auto cell_idx) {
for (const auto row : outdated_ids[id])
total_length += get_default(row).size + 1;
});
}
out->getChars().reserve(total_length);
for (const auto row : ext::range(0, ext::size(ids)))
{
const auto id = ids[row];
const auto it = map.find(id);
const auto string_ref = it != std::end(map) ? StringRef{it->second} : get_default(row);
out->insertData(string_ref.data, string_ref.size);
}
}
template <typename PresentIdHandler, typename AbsentIdHandler> template <typename PresentIdHandler, typename AbsentIdHandler>
void update( void update(
const std::vector<id_t> & requested_ids, PresentIdHandler && on_cell_updated, const std::vector<id_t> & requested_ids, PresentIdHandler && on_cell_updated,
AbsentIdHandler && on_id_not_found) const AbsentIdHandler && on_id_not_found) const;
{
MapType<UInt8> remaining_ids{requested_ids.size()};
for (const auto id : requested_ids)
remaining_ids.insert({ id, 0 });
std::uniform_int_distribution<std::uint64_t> distribution{ std::uint64_t getCellIdx(const id_t id) const;
dict_lifetime.min_sec,
dict_lifetime.max_sec
};
const Poco::ScopedWriteRWLock write_lock{rw_lock}; void setDefaultAttributeValue(attribute_t & attribute, const id_t idx) const;
auto stream = source_ptr->loadIds(requested_ids); void setAttributeValue(attribute_t & attribute, const id_t idx, const Field & value) const;
stream->readPrefix();
while (const auto block = stream->read()) attribute_t & getAttribute(const std::string & attribute_name) const;
{
const auto id_column = typeid_cast<const ColumnUInt64 *>(block.getByPosition(0).column.get());
if (!id_column)
throw Exception{
name + ": id column has type different from UInt64.",
ErrorCodes::TYPE_MISMATCH
};
const auto & ids = id_column->getData();
/// cache column pointers
const auto column_ptrs = ext::map<std::vector>(ext::range(0, attributes.size()), [&block] (const auto & i) {
return block.getByPosition(i + 1).column.get();
});
for (const auto i : ext::range(0, ids.size()))
{
const auto id = ids[i];
const auto cell_idx = getCellIdx(id);
auto & cell = cells[cell_idx];
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *column_ptrs[attribute_idx];
auto & attribute = attributes[attribute_idx];
setAttributeValue(attribute, cell_idx, attribute_column[i]);
}
/// if cell id is zero and zero does not map to this cell, then the cell is unused
if (cell.id == 0 && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
cell.id = id;
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// inform caller
on_cell_updated(id, cell_idx);
/// mark corresponding id as found
remaining_ids[id] = 1;
}
}
stream->readSuffix();
/// Check which ids have not been found and require setting null_value
for (const auto id_found_pair : remaining_ids)
{
if (id_found_pair.second)
continue;
const auto id = id_found_pair.first;
const auto cell_idx = getCellIdx(id);
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
for (auto & attribute : attributes)
setDefaultAttributeValue(attribute, cell_idx);
/// Check if cell had not been occupied before and increment element counter if it hadn't
if (cell.id == 0 && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
cell.id = id;
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
cell.setDefault();
/// inform caller that the cell has not been found
on_id_not_found(id, cell_idx);
}
}
/// Maps an id to the index of its cell: the 64-bit hash of the id masked with `size - 1`.
std::uint64_t getCellIdx(const id_t id) const
{
    return intHash64(id) & (size - 1);
}
/** Stores the attribute's null value into the element with index `idx`.
  * For strings the element is made to reference the null value itself; a previously
  * stored string is returned to `string_arena` first.
  */
void setDefaultAttributeValue(attribute_t & attribute, const id_t idx) const
{
    /// Numeric cases only differ in the stored type.
#define SET_NULL_VALUE(TYPE)\
        case AttributeUnderlyingType::TYPE:\
            std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = std::get<TYPE>(attribute.null_values);\
            break;

    switch (attribute.type)
    {
        SET_NULL_VALUE(UInt8)
        SET_NULL_VALUE(UInt16)
        SET_NULL_VALUE(UInt32)
        SET_NULL_VALUE(UInt64)
        SET_NULL_VALUE(Int8)
        SET_NULL_VALUE(Int16)
        SET_NULL_VALUE(Int32)
        SET_NULL_VALUE(Int64)
        SET_NULL_VALUE(Float32)
        SET_NULL_VALUE(Float64)

        case AttributeUnderlyingType::String:
        {
            const auto & null_value = std::get<String>(attribute.null_values);
            auto & stored = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];

            /// Already referencing the null value: nothing to release or rewrite.
            if (stored.data == null_value.data())
                break;

            if (stored.data)
                string_arena->free(const_cast<char *>(stored.data), stored.size);

            stored = StringRef{null_value};
            break;
        }
    }

#undef SET_NULL_VALUE
}
/** Stores `value` into the element with index `idx` of the attribute's array.
  * Numeric values are read from the Field as UInt64 / Int64 / Float64 and assigned to the stored type.
  * Strings are copied into `string_arena` (with the terminating zero byte); an empty string
  * is stored as an empty StringRef without allocating.
  */
void setAttributeValue(attribute_t & attribute, const id_t idx, const Field & value) const
{
    /// Numeric cases differ only in the stored type and in the Field type the value is read as.
#define SET_NUMERIC_VALUE(TYPE, FIELD_TYPE)\
        case AttributeUnderlyingType::TYPE:\
            std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = value.get<FIELD_TYPE>();\
            break;

    switch (attribute.type)
    {
        SET_NUMERIC_VALUE(UInt8, UInt64)
        SET_NUMERIC_VALUE(UInt16, UInt64)
        SET_NUMERIC_VALUE(UInt32, UInt64)
        SET_NUMERIC_VALUE(UInt64, UInt64)
        SET_NUMERIC_VALUE(Int8, Int64)
        SET_NUMERIC_VALUE(Int16, Int64)
        SET_NUMERIC_VALUE(Int32, Int64)
        SET_NUMERIC_VALUE(Int64, Int64)
        SET_NUMERIC_VALUE(Float32, Float64)
        SET_NUMERIC_VALUE(Float64, Float64)

        case AttributeUnderlyingType::String:
        {
            const auto & new_string = value.get<String>();
            auto & stored = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
            const auto & null_value = std::get<String>(attribute.null_values);

            /// The null value is not owned by the arena, so it must never be freed.
            const bool owned_by_arena = stored.data && stored.data != null_value.data();
            if (owned_by_arena)
                string_arena->free(const_cast<char *>(stored.data), stored.size);

            const auto length = new_string.size();
            if (length == 0)
            {
                stored = {};
                break;
            }

            /// Copy the characters together with the terminating zero byte.
            auto buffer = string_arena->alloc(length + 1);
            std::copy(new_string.data(), new_string.data() + length + 1, buffer);
            stored = StringRef{buffer, length};
            break;
        }
    }

#undef SET_NUMERIC_VALUE
}
/// Returns the attribute registered under `attribute_name`.
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS if there is no attribute with that name.
attribute_t & getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS
    };
}
/** Returns the smallest power of two that is not less than `n`.
  *
  * `n == 0` yields 1: the previous implementation wrapped around and returned 0,
  * which is not a power of two and breaks the `size - 1` mask used to compute cell indices.
  *
  * If the result is not representable in std::size_t (n is greater than the highest
  * power of two), the function returns 0, as before.
  */
static std::size_t round_up_to_power_of_two(std::size_t n)
{
    if (n == 0)
        return 1;

    --n;

    /// Smear the highest set bit into every lower position.
    /// The shift is bounded by the width of std::size_t (assuming 8-bit bytes), because
    /// shifting by the full width or more - e.g. `>> 32` on a 32-bit size_t - is undefined behaviour.
    for (std::size_t shift = 1; shift < 8 * sizeof(n); shift <<= 1)
        n |= n >> shift;

    return n + 1;
}
/// Produces a seed by XOR-ing the nanosecond part of the CLOCK_MONOTONIC reading with the process id.
static std::uint64_t getSeed()
{
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const auto mixed = now.tv_nsec ^ getpid();
    return mixed;
}
const std::string name; const std::string name;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;
@ -797,7 +231,7 @@ private:
attribute_t * hierarchical_attribute = nullptr; attribute_t * hierarchical_attribute = nullptr;
std::unique_ptr<ArenaWithFreeLists> string_arena; std::unique_ptr<ArenaWithFreeLists> string_arena;
mutable std::mt19937_64 rnd_engine{getSeed()}; mutable std::mt19937_64 rnd_engine;
mutable std::size_t bytes_allocated = 0; mutable std::size_t bytes_allocated = 0;
mutable std::atomic<std::size_t> element_count{0}; mutable std::atomic<std::size_t> element_count{0};

View File

@ -26,35 +26,15 @@
namespace DB namespace DB
{ {
/// Error-code constants used when throwing Exception.
/// These are `extern` declarations only; the constants are defined elsewhere.
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int BAD_ARGUMENTS;
extern const int UNSUPPORTED_METHOD;
}
class ComplexKeyCacheDictionary final : public IDictionaryBase class ComplexKeyCacheDictionary final : public IDictionaryBase
{ {
public: public:
ComplexKeyCacheDictionary(const std::string & name, const DictionaryStructure & dict_struct, ComplexKeyCacheDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime,
const std::size_t size) const std::size_t size);
: name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
size{round_up_to_power_of_two(size)}
{
if (!this->source_ptr->supportsSelectiveLoad())
throw Exception{
name + ": source cannot be used with ComplexKeyCacheDictionary",
ErrorCodes::UNSUPPORTED_METHOD
};
createAttributes(); ComplexKeyCacheDictionary(const ComplexKeyCacheDictionary & other);
}
ComplexKeyCacheDictionary(const ComplexKeyCacheDictionary & other)
: ComplexKeyCacheDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.size}
{}
std::string getKeyDescription() const { return key_description; }; std::string getKeyDescription() const { return key_description; };
@ -110,21 +90,7 @@ public:
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
dict_struct.validateKeyTypes(key_types);\
\
auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItems<TYPE>(attribute, key_columns, out, [&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -136,40 +102,15 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
ColumnString * out) const ColumnString * out) const;
{
dict_struct.validateKeyTypes(key_types);
auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
const auto null_value = StringRef{std::get<String>(attribute.null_values)};
getItems(attribute, key_columns, out, [&] (const std::size_t) { return null_value; });
}
/// Generates get<TYPE>() overloads that take a per-row default array: for every row of `key_columns`
/// the attribute value is written to `out[row]`, and `def[row]` is used where getItems asks for a default.
/// Each generated method throws Exception with ErrorCodes::TYPE_MISMATCH when the attribute's
/// underlying type is not TYPE.
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const\ const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const;
{\
dict_struct.validateKeyTypes(key_types);\
\
auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, key_columns, out, [&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -181,38 +122,15 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
/// Looks up a String attribute for every row of `key_columns` and writes the values to `out`.
/// Where getItems asks for a default, the string at the same row of `def` is used.
/// Throws Exception with ErrorCodes::TYPE_MISMATCH when the attribute is not of String type.
void getString( void getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const ColumnString * const def, ColumnString * const out) const const ColumnString * const def, ColumnString * const out) const;
{
/// key type checking is delegated to the dictionary structure (defined elsewhere)
dict_struct.validateKeyTypes(key_types);
auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
/// the default getter reads the row-aligned value from `def`
getItems(attribute, key_columns, out, [&] (const std::size_t row) { return def->getDataAt(row); });
}
/// Generates get<TYPE>() overloads that take a single constant default: for every row of `key_columns`
/// the attribute value is written to `out[row]`, and `def` is used where getItems asks for a default.
/// Each generated method throws Exception with ErrorCodes::TYPE_MISMATCH when the attribute's
/// underlying type is not TYPE.
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
const TYPE def, PaddedPODArray<TYPE> & out) const\ const TYPE def, PaddedPODArray<TYPE> & out) const;
{\
dict_struct.validateKeyTypes(key_types);\
\
auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, key_columns, out, [&] (const std::size_t) { return def; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -224,78 +142,12 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
/// Looks up a String attribute for every row of `key_columns` and writes the values to `out`.
/// Where getItems asks for a default, the single string `def` is used for every row.
/// Throws Exception with ErrorCodes::TYPE_MISMATCH when the attribute is not of String type.
void getString( void getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const String & def, ColumnString * const out) const const String & def, ColumnString * const out) const;
{
/// key type checking is delegated to the dictionary structure (defined elsewhere)
dict_struct.validateKeyTypes(key_types);
auto & attribute = getAttribute(attribute_name); void has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const;
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
/// the default getter wraps `def` in a non-owning StringRef on every call
getItems(attribute, key_columns, out, [&] (const std::size_t) { return StringRef{def}; });
}
void has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
{
dict_struct.validateKeyTypes(key_types);
/// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
MapType<std::vector<std::size_t>> outdated_keys;
const auto rows = key_columns.front()->size();
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
PODArray<StringRef> keys_array(rows);
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
{
const auto key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
keys_array[row] = key;
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
/** cell should be updated if either:
* 1. keys (or hash) do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
outdated_keys[key].push_back(row);
else
out[row] = !cell.isDefault();
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);
if (outdated_keys.empty())
return;
std::vector<std::size_t> required_rows(outdated_keys.size());
std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows),
[] (auto & pair) { return pair.second.front(); });
/// request new values
update(key_columns, keys_array, required_rows, [&] (const auto key, const auto) {
for (const auto out_idx : outdated_keys[key])
out[out_idx] = true;
}, [&] (const auto key, const auto) {
for (const auto out_idx : outdated_keys[key])
out[out_idx] = false;
});
}
private: private:
template <typename Value> using MapType = HashMapWithSavedHash<StringRef, Value, StringRefHash>; template <typename Value> using MapType = HashMapWithSavedHash<StringRef, Value, StringRefHash>;
@ -339,584 +191,55 @@ private:
ContainerPtrType<StringRef>> arrays; ContainerPtrType<StringRef>> arrays;
}; };
/// Builds `attributes` and `attribute_index_by_name` from `dict_struct.attributes`.
/// Throws Exception with ErrorCodes::TYPE_MISMATCH on the first attribute flagged hierarchical.
void createAttributes() void createAttributes();
{
/// NOTE(review): this local `size` is the number of attributes; other methods use a member also
/// named `size` as the cell count (`hash & (size - 1)`). The metadata accounting below therefore
/// multiplies by the attribute count - confirm that this is what was intended.
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
bytes_allocated += size * sizeof(cell_metadata_t); attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
/// sizeof does not evaluate its operand, so front() is never called on the still-empty vector
bytes_allocated += size * sizeof(attributes.front());
for (const auto & attribute : dict_struct.attributes) template <typename OutputType, typename DefaultGetter>
{ void getItemsNumber(
/// the index recorded is the position the attribute is about to take in `attributes`
attribute_index_by_name.emplace(attribute.name, attributes.size()); attribute_t & attribute,
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value)); const ConstColumnPlainPtrs & key_columns,
PaddedPODArray<OutputType> & out,
DefaultGetter && get_default) const;
/// the check runs after the attribute has already been appended
if (attribute.hierarchical) template <typename AttributeType, typename OutputType, typename DefaultGetter>
throw Exception{ void getItemsNumberImpl(
name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), attribute_t & attribute,
ErrorCodes::TYPE_MISMATCH const ConstColumnPlainPtrs & key_columns,
}; PaddedPODArray<OutputType> & out,
} DefaultGetter && get_default) const;
}
/// Creates the storage for one attribute of the given underlying type: records its null value,
/// allocates a container of `size` elements and adds the container's footprint to `bytes_allocated`.
/// String attributes additionally make sure the shared `string_arena` exists.
attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t attr{type};

/// Numeric attributes differ only in the stored type and in the Field type the null value is read as.
#define INIT_NUMERIC_ATTRIBUTE(TYPE, FIELD_TYPE)\
        case AttributeUnderlyingType::TYPE:\
            std::get<TYPE>(attr.null_values) = null_value.get<FIELD_TYPE>();\
            std::get<ContainerPtrType<TYPE>>(attr.arrays) = std::make_unique<ContainerType<TYPE>>(size);\
            bytes_allocated += size * sizeof(TYPE);\
            break;

    switch (type)
    {
        INIT_NUMERIC_ATTRIBUTE(UInt8, UInt64)
        INIT_NUMERIC_ATTRIBUTE(UInt16, UInt64)
        INIT_NUMERIC_ATTRIBUTE(UInt32, UInt64)
        INIT_NUMERIC_ATTRIBUTE(UInt64, UInt64)
        INIT_NUMERIC_ATTRIBUTE(Int8, Int64)
        INIT_NUMERIC_ATTRIBUTE(Int16, Int64)
        INIT_NUMERIC_ATTRIBUTE(Int32, Int64)
        INIT_NUMERIC_ATTRIBUTE(Int64, Int64)
        INIT_NUMERIC_ATTRIBUTE(Float32, Float64)
        INIT_NUMERIC_ATTRIBUTE(Float64, Float64)

        case AttributeUnderlyingType::String:
            std::get<String>(attr.null_values) = null_value.get<String>();
            std::get<ContainerPtrType<StringRef>>(attr.arrays) = std::make_unique<ContainerType<StringRef>>(size);
            bytes_allocated += size * sizeof(StringRef);
            /// the arena backing string values is created lazily, once for all string attributes
            if (!string_arena)
                string_arena = std::make_unique<ArenaWithFreeLists>();
            break;
    }

#undef INIT_NUMERIC_ATTRIBUTE

    return attr;
}
/// Fills `out[row]` with the value of a numeric attribute for every row of `key_columns`.
/// Rows served by a usable cache cell are answered under the read lock; the remaining keys are
/// requested through update(). `get_default(row)` supplies the value for default or missing keys.
template <typename T, typename DefaultGetter>
void getItems(
    attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray<T> & out,
    DefaultGetter && get_default) const
{
    /// <key> -> all rows of `key_columns` carrying that key which the cache could not answer
    MapType<std::vector<std::size_t>> stale_rows_by_key;
    auto & values = std::get<ContainerPtrType<T>>(attribute.arrays);

    const auto row_count = key_columns.front()->size();
    StringRefs key_parts(dict_struct.key.value().size());
    Arena scratch_pool;
    PODArray<StringRef> row_keys(row_count);

    {
        const Poco::ScopedReadRWLock read_lock{rw_lock};

        const auto now = std::chrono::system_clock::now();

        /// answer what we can from the cache, remember the rest
        for (const auto row : ext::range(0, row_count))
        {
            const auto key = placeKeysInPool(row, key_columns, key_parts, scratch_pool);
            row_keys[row] = key;

            const auto hash = StringRefHash{}(key);
            const auto cell_idx = hash & (size - 1);
            const auto & cell = cells[cell_idx];

            /// a cell is usable only if it holds exactly this key and has not expired
            if (cell.hash == hash && cell.key == key && !(cell.expiresAt() < now))
            {
                if (cell.isDefault())
                    out[row] = get_default(row);
                else
                    out[row] = values[cell_idx];
            }
            else
                stale_rows_by_key[key].push_back(row);
        }
    }

    query_count.fetch_add(row_count, std::memory_order_relaxed);
    hit_count.fetch_add(row_count - stale_rows_by_key.size(), std::memory_order_release);

    if (stale_rows_by_key.empty())
        return;

    /// one representative row per stale key is enough to request it
    std::vector<std::size_t> required_rows;
    required_rows.reserve(stale_rows_by_key.size());
    for (auto & key_and_rows : stale_rows_by_key)
        required_rows.push_back(key_and_rows.second.front());

    const auto store_fetched = [&] (const auto key, const auto cell_idx) {
        for (const auto row : stale_rows_by_key[key])
            out[row] = values[cell_idx];
    };

    const auto store_default = [&] (const auto key, const auto) {
        for (const auto row : stale_rows_by_key[key])
            out[row] = get_default(row);
    };

    /// request new values
    update(key_columns, row_keys, required_rows, store_fetched, store_default);
}
/// Appends the value of a String attribute to `out` for every row of `key_columns`, in row order.
/// Works in two passes: an optimistic pass that appends straight from the cache and gives up on the
/// first unusable cell, and a pessimistic pass that collects values per key, requests the missing
/// keys through update() and only then appends all rows.
/// `get_default(row)` supplies the value for default or missing keys.
template <typename DefaultGetter> template <typename DefaultGetter>
void getItems( void getItemsString(
attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, ColumnString * out, attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, ColumnString * out,
DefaultGetter && get_default) const DefaultGetter && get_default) const;
{
const auto rows = key_columns.front()->size();
/// save on some allocations
out->getOffsets().reserve(rows);
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
auto & attribute_array = std::get<ContainerPtrType<StringRef>>(attribute.arrays);
auto found_outdated_values = false;
/// perform optimistic version, fallback to pessimistic if failed
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, discard on fail
for (const auto row : ext::range(0, rows))
{
const auto key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
/// the key is only needed for this iteration, so its pool space is given back on scope exit
SCOPE_EXIT(temporary_keys_pool.rollback(key.size));
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
{
found_outdated_values = true;
break;
}
else
{
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
}
}
/// optimistic code completed successfully
if (!found_outdated_values)
{
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows, std::memory_order_release);
return;
}
/// now onto the pessimistic one, discard possible partial results from the optimistic path
out->getChars().resize_assume_reserved(0);
out->getOffsets().resize_assume_reserved(0);
/// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
MapType<std::vector<std::size_t>> outdated_keys;
/// we are going to store every string separately
MapType<String> map;
/// unlike the optimistic pass, keys stay in the temporary pool because they are reused below
PODArray<StringRef> keys_array(rows);
std::size_t total_length = 0;
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
for (const auto row : ext::range(0, rows))
{
const auto key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
keys_array[row] = key;
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
outdated_keys[key].push_back(row);
else
{
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
/// only real values are copied into `map`; default rows are re-resolved via get_default at the end
if (!cell.isDefault())
map[key] = String{string_ref};
/// one extra byte per value is counted on top of its length
total_length += string_ref.size + 1;
}
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);
/// request new values
if (!outdated_keys.empty())
{
/// one representative row per outdated key
std::vector<std::size_t> required_rows(outdated_keys.size());
std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows),
[] (auto & pair) { return pair.second.front(); });
update(key_columns, keys_array, required_rows, [&] (const auto key, const auto cell_idx) {
const auto attribute_value = attribute_array[cell_idx];
map[key] = String{attribute_value};
total_length += (attribute_value.size + 1) * outdated_keys[key].size();
}, [&] (const auto key, const auto cell_idx) {
for (const auto row : outdated_keys[key])
total_length += get_default(row).size + 1;
});
}
out->getChars().reserve(total_length);
/// emit rows in their original order: a collected value if there is one, the default otherwise
for (const auto row : ext::range(0, ext::size(keys_array)))
{
const auto key = keys_array[row];
const auto it = map.find(key);
const auto string_ref = it != std::end(map) ? StringRef{it->second} : get_default(row);
out->insertData(string_ref.data, string_ref.size);
}
}
/// Loads the keys at `in_requested_rows` from the source and stores them in the cache cells,
/// holding the write lock for the whole operation.
/// `on_cell_updated(key, cell_idx)` is called for every row the source returned;
/// `on_key_not_found(key, cell_idx)` is called for every requested key the source did not return,
/// after its cell has been filled with null values and marked default.
template <typename PresentKeyHandler, typename AbsentKeyHandler> template <typename PresentKeyHandler, typename AbsentKeyHandler>
void update( void update(
const ConstColumnPlainPtrs & in_key_columns, const PODArray<StringRef> & in_keys, const ConstColumnPlainPtrs & in_key_columns, const PODArray<StringRef> & in_keys,
const std::vector<std::size_t> & in_requested_rows, PresentKeyHandler && on_cell_updated, const std::vector<std::size_t> & in_requested_rows, PresentKeyHandler && on_cell_updated,
AbsentKeyHandler && on_key_not_found) const AbsentKeyHandler && on_key_not_found) const;
{
/// requested key -> whether the source has returned it
MapType<bool> remaining_keys{in_requested_rows.size()};
for (const auto row : in_requested_rows)
remaining_keys.insert({ in_keys[row], false });
/// cell lifetimes are drawn from [min_sec, max_sec]
std::uniform_int_distribution<std::uint64_t> distribution{ std::uint64_t getCellIdx(const StringRef key) const;
dict_lifetime.min_sec,
dict_lifetime.max_sec
};
const Poco::ScopedWriteRWLock write_lock{rw_lock}; void setDefaultAttributeValue(attribute_t & attribute, const std::size_t idx) const;
auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows); void setAttributeValue(attribute_t & attribute, const std::size_t idx, const Field & value) const;
stream->readPrefix();
const auto keys_size = dict_struct.key.value().size(); attribute_t & getAttribute(const std::string & attribute_name) const;
StringRefs keys(keys_size);
const auto attributes_size = attributes.size(); StringRef allocKey(const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys) const;
while (const auto block = stream->read()) void freeKey(const StringRef key) const;
{
/// cache column pointers
/// the first `keys_size` columns of a block are read as keys, the following ones as attributes
const auto key_columns = ext::map<ConstColumnPlainPtrs>(ext::range(0, keys_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(attribute_idx).column.get();
});
const auto attribute_columns = ext::map<ConstColumnPlainPtrs>(ext::range(0, attributes_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(keys_size + attribute_idx).column.get();
});
const auto rows = block.rowsInFirstColumn();
for (const auto row : ext::range(0, rows))
{
/// the key is allocated in the dictionary's own key pool, not in a temporary one
auto key = allocKey(row, key_columns, keys);
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
auto & cell = cells[cell_idx];
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *attribute_columns[attribute_idx];
auto & attribute = attributes[attribute_idx];
setAttributeValue(attribute, cell_idx, attribute_column[row]);
}
/// if cell id is zero and zero does not map to this cell, then the cell is unused
if (cell.key == StringRef{} && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
/// handle memory allocated for old key
if (key == cell.key)
{
/// the cell already owns an equal key: drop the fresh copy and keep using the stored one
freeKey(key);
key = cell.key;
}
else
{
/// new key is different from the old one
if (cell.key.data)
freeKey(cell.key);
cell.key = key;
}
cell.hash = hash;
/// a zero bound in the lifetime means the cell never expires
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// inform caller
on_cell_updated(key, cell_idx);
/// mark corresponding id as found
remaining_keys[key] = true;
}
}
stream->readSuffix();
/// Check which ids have not been found and require setting null_value
for (const auto key_found_pair : remaining_keys)
{
if (key_found_pair.second)
continue;
auto key = key_found_pair.first;
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
for (auto & attribute : attributes)
setDefaultAttributeValue(attribute, cell_idx);
/// Check if cell had not been occupied before and increment element counter if it hadn't
if (cell.key == StringRef{} && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
if (key == cell.key)
key = cell.key;
else
{
if (cell.key.data)
freeKey(cell.key);
/// copy key from temporary pool
key = copyKey(key);
cell.key = key;
}
cell.hash = hash;
/// same expiration policy as for keys that were found
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// the default flag is set after the expiration time has been written
cell.setDefault();
/// inform caller that the cell has not been found
on_key_not_found(key, cell_idx);
}
}
std::uint64_t getCellIdx(const StringRef key) const
{
const auto hash = StringRefHash{}(key);
const auto idx = hash & (size - 1);
return idx;
}
/// Overwrites the value stored at `idx` of `attribute` with the attribute's null value.
/// For strings the cell is pointed at the shared null value instead of copying it, and the memory
/// of the previous value is returned to `string_arena`.
void setDefaultAttributeValue(attribute_t & attribute, const std::size_t idx) const
{
/// Numeric attributes only differ in the stored type.
#define RESET_NUMERIC_ATTRIBUTE(TYPE)\
        case AttributeUnderlyingType::TYPE:\
            std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = std::get<TYPE>(attribute.null_values);\
            break;

    switch (attribute.type)
    {
        RESET_NUMERIC_ATTRIBUTE(UInt8)
        RESET_NUMERIC_ATTRIBUTE(UInt16)
        RESET_NUMERIC_ATTRIBUTE(UInt32)
        RESET_NUMERIC_ATTRIBUTE(UInt64)
        RESET_NUMERIC_ATTRIBUTE(Int8)
        RESET_NUMERIC_ATTRIBUTE(Int16)
        RESET_NUMERIC_ATTRIBUTE(Int32)
        RESET_NUMERIC_ATTRIBUTE(Int64)
        RESET_NUMERIC_ATTRIBUTE(Float32)
        RESET_NUMERIC_ATTRIBUTE(Float64)

        case AttributeUnderlyingType::String:
        {
            const auto & null_value = std::get<String>(attribute.null_values);
            auto & stored = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];

            /// already pointing at the null value - nothing to release or replace
            if (stored.data == null_value.data())
                break;

            if (stored.data)
                string_arena->free(const_cast<char *>(stored.data), stored.size);

            stored = StringRef{null_value};
            break;
        }
    }

#undef RESET_NUMERIC_ATTRIBUTE
}
/// Stores `value` at `idx` of `attribute`, reading the Field as UInt64, Int64, Float64 or String
/// depending on the attribute's underlying type.
/// String values are copied into `string_arena` together with their terminating zero; an empty
/// string is stored as an empty StringRef without any allocation.
void setAttributeValue(attribute_t & attribute, const std::size_t idx, const Field & value) const
{
/// Numeric attributes differ only in the stored type and in the Field type the value is read as.
#define STORE_NUMERIC_ATTRIBUTE(TYPE, FIELD_TYPE)\
        case AttributeUnderlyingType::TYPE:\
            std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = value.get<FIELD_TYPE>();\
            break;

    switch (attribute.type)
    {
        STORE_NUMERIC_ATTRIBUTE(UInt8, UInt64)
        STORE_NUMERIC_ATTRIBUTE(UInt16, UInt64)
        STORE_NUMERIC_ATTRIBUTE(UInt32, UInt64)
        STORE_NUMERIC_ATTRIBUTE(UInt64, UInt64)
        STORE_NUMERIC_ATTRIBUTE(Int8, Int64)
        STORE_NUMERIC_ATTRIBUTE(Int16, Int64)
        STORE_NUMERIC_ATTRIBUTE(Int32, Int64)
        STORE_NUMERIC_ATTRIBUTE(Int64, Int64)
        STORE_NUMERIC_ATTRIBUTE(Float32, Float64)
        STORE_NUMERIC_ATTRIBUTE(Float64, Float64)

        case AttributeUnderlyingType::String:
        {
            const auto & new_value = value.get<String>();
            auto & stored = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
            const auto & null_value = std::get<String>(attribute.null_values);

            /// release the previous value unless the cell is empty or aliases the shared null value
            const bool owns_memory = stored.data && stored.data != null_value.data();
            if (owns_memory)
                string_arena->free(const_cast<char *>(stored.data), stored.size);

            const auto length = new_value.size();
            if (length == 0)
            {
                stored = {};
                break;
            }

            /// `length + 1` keeps the terminating zero of the source string
            auto copy = string_arena->alloc(length + 1);
            std::copy(new_value.data(), new_value.data() + length + 1, copy);
            stored = StringRef{copy, length};
            break;
        }
    }

#undef STORE_NUMERIC_ATTRIBUTE
}
/// Returns the attribute registered under `attribute_name`.
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS when there is no such attribute.
attribute_t & getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);
    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS
    };
}
/// Serializes the key of `row` into the dictionary's own key storage: the fixed-size pool when
/// `key_size_is_fixed` is set, `keys_pool` otherwise.
StringRef allocKey(const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys) const
{
    return key_size_is_fixed
        ? placeKeysInFixedSizePool(row, key_columns)
        : placeKeysInPool(row, key_columns, keys, *keys_pool);
}
/// Returns the memory of `key` to the pool matching the key layout (see `key_size_is_fixed`).
void freeKey(const StringRef key) const
{
    if (!key_size_is_fixed)
    {
        /// variable-size keys have to be released together with their length
        keys_pool->free(const_cast<char *>(key.data), key.size);
        return;
    }

    fixed_size_keys_pool->free(const_cast<char *>(key.data));
}
/// Rounds `n` up to the nearest power of two; a value that already is a power of two is returned
/// unchanged.
/// Edge cases (unchanged from the original bit trick): 0 yields 0, and a value above the largest
/// representable power of two wraps around to 0.
static std::size_t round_up_to_power_of_two(std::size_t n)
{
    --n;
    /// Smear the highest set bit into every lower position. The shift distances are derived from
    /// the width of std::size_t (assuming 8-bit bytes) rather than hard-coded up to 32, because
    /// shifting by the full width of the type is undefined behaviour where std::size_t is 32 bits.
    for (std::size_t shift = 1; shift < sizeof(n) * 8; shift <<= 1)
        n |= n >> shift;
    ++n;
    return n;
}
/// Produces a seed by XOR-ing the nanosecond part of the monotonic clock with the process id.
static std::uint64_t getSeed()
{
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const auto pid = getpid();
    return now.tv_nsec ^ pid;
}
/// Serializes the composite key of `row` into one contiguous buffer allocated from `pool`.
/// `keys` is scratch space: on return `keys[i]` refers to the data of the i-th key column for `row`.
/// Returns a StringRef covering the concatenated bytes; the memory belongs to `pool`.
template <typename Arena> template <typename Arena>
static StringRef placeKeysInPool( static StringRef placeKeysInPool(
const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys, Arena & pool) const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys, Arena & pool);
{
const auto keys_size = key_columns.size();
size_t sum_keys_size{};
/// first pass: collect the per-column parts and the total size needed
for (const auto i : ext::range(0, keys_size))
{
keys[i] = key_columns[i]->getDataAtWithTerminatingZero(row);
sum_keys_size += keys[i].size;
}
const auto res = pool.alloc(sum_keys_size);
auto place = res;
/// second pass: copy the parts back to back into the allocated buffer
for (size_t j = 0; j < keys_size; ++j)
{
memcpy(place, keys[j].data, keys[j].size);
place += keys[j].size;
}
return { res, sum_keys_size };
}
/// Serializes the composite key of `row` into a slot taken from `fixed_size_keys_pool`.
/// Returns a StringRef of `key_size` bytes over that slot.
/// NOTE(review): presumably the per-column sizes add up to `key_size` - nothing here checks it.
StringRef placeKeysInFixedSizePool( StringRef placeKeysInFixedSizePool(
const std::size_t row, const ConstColumnPlainPtrs & key_columns) const const std::size_t row, const ConstColumnPlainPtrs & key_columns) const;
{
const auto res = fixed_size_keys_pool->alloc();
auto place = res;
/// copy the value of every key column back to back into the slot
for (const auto & key_column : key_columns) StringRef copyKey(const StringRef key) const;
{
const auto key = key_column->getDataAt(row);
memcpy(place, key.data, key.size);
place += key.size;
}
return { res, key_size };
}
/// Copies `key` into the dictionary's own key storage (the fixed-size pool when
/// `key_size_is_fixed` is set, `keys_pool` otherwise) and returns a reference to the copy.
StringRef copyKey(const StringRef key) const
{
    const auto length = key.size;
    const auto copy = key_size_is_fixed ? fixed_size_keys_pool->alloc() : keys_pool->alloc(length);
    memcpy(copy, key.data, length);
    return StringRef{copy, length};
}
const std::string name; const std::string name;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;
@ -938,7 +261,7 @@ private:
std::make_unique<SmallObjectPool>(key_size) : nullptr; std::make_unique<SmallObjectPool>(key_size) : nullptr;
std::unique_ptr<ArenaWithFreeLists> string_arena; std::unique_ptr<ArenaWithFreeLists> string_arena;
mutable std::mt19937_64 rnd_engine{getSeed()}; mutable std::mt19937_64 rnd_engine;
mutable std::size_t bytes_allocated = 0; mutable std::size_t bytes_allocated = 0;
mutable std::atomic<std::size_t> element_count{0}; mutable std::atomic<std::size_t> element_count{0};

View File

@ -16,41 +16,14 @@
namespace DB namespace DB
{ {
/// Error codes referenced by this header; they are only declared here and defined elsewhere.
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
}
class ComplexKeyHashedDictionary final : public IDictionaryBase class ComplexKeyHashedDictionary final : public IDictionaryBase
{ {
public: public:
/// Creates the dictionary and immediately tries to load its data from `source_ptr`.
/// A failure while loading does not escape the constructor: the exception is stored in
/// `creation_exception` instead.
ComplexKeyHashedDictionary( ComplexKeyHashedDictionary(
const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr, const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr,
const DictionaryLifetime dict_lifetime, bool require_nonempty) const DictionaryLifetime dict_lifetime, bool require_nonempty);
: name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
/// runs outside the try block, so an exception thrown here does propagate to the caller
createAttributes();
try ComplexKeyHashedDictionary(const ComplexKeyHashedDictionary & other);
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
/// keep the error for later inspection rather than failing construction
creation_exception = std::current_exception();
}
/// recorded whether or not loading succeeded
creation_time = std::chrono::system_clock::now();
}
/// Copy construction delegates to the main constructor with a clone of the other dictionary's
/// source, so the data is loaded again from the source rather than copied from `other`.
ComplexKeyHashedDictionary(const ComplexKeyHashedDictionary & other)
: ComplexKeyHashedDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{}
/// Returns a copy of the stored `key_description` string.
std::string getKeyDescription() const { return key_description; }; std::string getKeyDescription() const { return key_description; };
@ -93,23 +66,7 @@ public:
/// Generates get<TYPE>() overloads without an explicit default: for every row of `key_columns`
/// the value handed over by getItems is written to `out[row]`, and the attribute's null value is
/// offered as the default. Each generated method throws Exception with ErrorCodes::TYPE_MISMATCH
/// when the attribute's underlying type is not TYPE.
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
dict_struct.validateKeyTypes(key_types);\
\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItems<TYPE>(attribute, key_columns,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -121,44 +78,15 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
/// Appends the value of a String attribute to `out` for every row getItems reports, offering the
/// attribute's null value as the default.
/// Throws Exception with ErrorCodes::TYPE_MISMATCH when the attribute is not of String type.
void getString( void getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
ColumnString * out) const ColumnString * out) const;
{
/// key type checking is delegated to the dictionary structure (defined elsewhere)
dict_struct.validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
/// `null_value` binds a const reference to the temporary StringRef, which extends its lifetime
const auto & null_value = StringRef{std::get<String>(attribute.null_values)};
/// the setter ignores `row` and appends, so it relies on getItems reporting rows in order
getItems<StringRef>(attribute, key_columns,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return null_value; });
}
/// Generates get<TYPE>() overloads that take a per-row default array: for every row of `key_columns`
/// the value handed over by getItems is written to `out[row]`, and `def[row]` is offered as the
/// default. Each generated method throws Exception with ErrorCodes::TYPE_MISMATCH when the
/// attribute's underlying type is not TYPE.
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const\ const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const;
{\
dict_struct.validateKeyTypes(key_types);\
\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, key_columns,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -170,42 +98,15 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
/// Appends the value of a String attribute to `out` for every row getItems reports, offering the
/// string at the same row of `def` as the default.
/// Throws Exception with ErrorCodes::TYPE_MISMATCH when the attribute is not of String type.
void getString( void getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const ColumnString * const def, ColumnString * const out) const const ColumnString * const def, ColumnString * const out) const;
{
/// key type checking is delegated to the dictionary structure (defined elsewhere)
dict_struct.validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
/// the setter ignores `row` and appends, so it relies on getItems reporting rows in order
getItems<StringRef>(attribute, key_columns,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t row) { return def->getDataAt(row); });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\ const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
const TYPE def, PaddedPODArray<TYPE> & out) const\ const TYPE def, PaddedPODArray<TYPE> & out) const;
{\
dict_struct.validateKeyTypes(key_types);\
\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, key_columns,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return def; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -217,45 +118,12 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const String & def, ColumnString * const out) const const String & def, ColumnString * const out) const;
{
dict_struct.validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name); void has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const;
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems<StringRef>(attribute, key_columns,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return StringRef{def}; });
}
void has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
{
dict_struct.validateKeyTypes(key_types);
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, key_columns, out); break;
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, key_columns, out); break;
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, key_columns, out); break;
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int8: has<Int8>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int16: has<Int16>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int32: has<Int32>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int64: has<Int64>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Float32: has<Float32>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Float64: has<Float64>(attribute, key_columns, out); break;
case AttributeUnderlyingType::String: has<StringRef>(attribute, key_columns, out); break;
}
}
private: private:
template <typename Value> using ContainerType = HashMapWithSavedHash<StringRef, Value, StringRefHash>; template <typename Value> using ContainerType = HashMapWithSavedHash<StringRef, Value, StringRefHash>;
@ -277,276 +145,48 @@ private:
std::unique_ptr<Arena> string_arena; std::unique_ptr<Arena> string_arena;
}; };
void createAttributes() void createAttributes();
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes) void loadData();
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
throw Exception{
name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
ErrorCodes::TYPE_MISMATCH
};
}
}
/// Reads the whole source (`source_ptr->loadAll()`) block by block and fills every attribute map.
/// In each block the first `keys_size` columns are taken as key columns and the following
/// `attributes.size()` columns as attribute values, in that order.
/// Throws DICTIONARY_IS_EMPTY when `require_nonempty` is set and no rows were read.
void loadData()
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
/// created upfront to avoid excess allocations
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
while (const auto block = stream->read())
{
const auto rows = block.rowsInFirstColumn();
/// every source row is counted, including rows whose key turns out to be already present
element_count += rows;
/// key columns occupy block positions [0, keys_size)
const auto key_column_ptrs = ext::map<ConstColumnPlainPtrs>(ext::range(0, keys_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(attribute_idx).column.get();
});
/// attribute columns follow the key columns: positions [keys_size, keys_size + attributes_size)
const auto attribute_column_ptrs = ext::map<ConstColumnPlainPtrs>(ext::range(0, attributes_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(keys_size + attribute_idx).column.get();
});
for (const auto row_idx : ext::range(0, rows))
{
/// calculate key once per row
const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool);
auto should_rollback = false;
for (const auto attribute_idx : ext::range(0, attributes_size))
{
const auto & attribute_column = *attribute_column_ptrs[attribute_idx];
auto & attribute = attributes[attribute_idx];
const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]);
/// a rejected insert means the key copy just placed in keys_pool is not referenced by that map
if (!inserted)
should_rollback = true;
}
/// @note on multiple equal keys the mapped value for the first one is stored
/// release the bytes that placeKeysInPool allocated for this row's key
if (should_rollback)
keys_pool.rollback(key.size);
}
}
stream->readSuffix();
if (require_nonempty && 0 == element_count)
throw Exception{
name + ": dictionary source is empty and 'require_nonempty' property is set.",
ErrorCodes::DICTIONARY_IS_EMPTY
};
}
template <typename T> template <typename T>
void addAttributeSize(const attribute_t & attribute) void addAttributeSize(const attribute_t & attribute);
{
const auto & map_ref = std::get<ContainerPtrType<T>>(attribute.maps);
bytes_allocated += sizeof(ContainerType<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
void calculateBytesAllocated() void calculateBytesAllocated();
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
case AttributeUnderlyingType::String:
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
bytes_allocated += keys_pool.size();
}
template <typename T> template <typename T>
void createAttributeImpl(attribute_t & attribute, const Field & null_value) void createAttributeImpl(attribute_t & attribute, const Field & null_value);
{
std::get<T>(attribute.null_values) = null_value.get<typename NearestFieldType<T>::Type>();
std::get<ContainerPtrType<T>>(attribute.maps) = std::make_unique<ContainerType<T>>();
}
attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
{
attribute_t attr{type};
switch (type)
{
case AttributeUnderlyingType::UInt8: createAttributeImpl<UInt8>(attr, null_value); break;
case AttributeUnderlyingType::UInt16: createAttributeImpl<UInt16>(attr, null_value); break;
case AttributeUnderlyingType::UInt32: createAttributeImpl<UInt32>(attr, null_value); break;
case AttributeUnderlyingType::UInt64: createAttributeImpl<UInt64>(attr, null_value); break;
case AttributeUnderlyingType::Int8: createAttributeImpl<Int8>(attr, null_value); break;
case AttributeUnderlyingType::Int16: createAttributeImpl<Int16>(attr, null_value); break;
case AttributeUnderlyingType::Int32: createAttributeImpl<Int32>(attr, null_value); break;
case AttributeUnderlyingType::Int64: createAttributeImpl<Int64>(attr, null_value); break;
case AttributeUnderlyingType::Float32: createAttributeImpl<Float32>(attr, null_value); break;
case AttributeUnderlyingType::Float64: createAttributeImpl<Float64>(attr, null_value); break;
case AttributeUnderlyingType::String:
{
std::get<String>(attr.null_values) = null_value.get<String>();
std::get<ContainerPtrType<StringRef>>(attr.maps) = std::make_unique<ContainerType<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
}
return attr; template <typename OutputType, typename ValueSetter, typename DefaultGetter>
} void getItemsNumber(
const attribute_t & attribute,
const ConstColumnPlainPtrs & key_columns,
ValueSetter && set_value,
DefaultGetter && get_default) const;
template <typename T, typename ValueSetter, typename DefaultGetter> template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void getItems( void getItemsImpl(
const attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, ValueSetter && set_value, const attribute_t & attribute,
DefaultGetter && get_default) const const ConstColumnPlainPtrs & key_columns,
{ ValueSetter && set_value,
const auto & attr = *std::get<ContainerPtrType<T>>(attribute.maps); DefaultGetter && get_default) const;
const auto keys_size = key_columns.size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
const auto rows = key_columns.front()->size();
for (const auto i : ext::range(0, rows))
{
/// copy key data to arena so it is contiguous and return StringRef to it
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
const auto it = attr.find(key);
set_value(i, it != attr.end() ? it->second : get_default(i));
/// free memory allocated for the key
temporary_keys_pool.rollback(key.size);
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T> template <typename T>
bool setAttributeValueImpl(attribute_t & attribute, const StringRef key, const T value) bool setAttributeValueImpl(attribute_t & attribute, const StringRef key, const T value);
{
auto & map = *std::get<ContainerPtrType<T>>(attribute.maps);
const auto pair = map.insert({ key, value });
return pair.second;
}
bool setAttributeValue(attribute_t & attribute, const StringRef key, const Field & value) bool setAttributeValue(attribute_t & attribute, const StringRef key, const Field & value);
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: return setAttributeValueImpl<UInt8>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::UInt16: return setAttributeValueImpl<UInt16>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::UInt32: return setAttributeValueImpl<UInt32>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::UInt64: return setAttributeValueImpl<UInt64>(attribute, key, value.get<UInt64>());
case AttributeUnderlyingType::Int8: return setAttributeValueImpl<Int8>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::Int16: return setAttributeValueImpl<Int16>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::Int32: return setAttributeValueImpl<Int32>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::Int64: return setAttributeValueImpl<Int64>(attribute, key, value.get<Int64>());
case AttributeUnderlyingType::Float32: return setAttributeValueImpl<Float32>(attribute, key, value.get<Float64>());
case AttributeUnderlyingType::Float64: return setAttributeValueImpl<Float64>(attribute, key, value.get<Float64>());
case AttributeUnderlyingType::String:
{
auto & map = *std::get<ContainerPtrType<StringRef>>(attribute.maps);
const auto & string = value.get<String>();
const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());
const auto pair = map.insert({ key, StringRef{string_in_arena, string.size()} });
return pair.second;
}
}
return {}; const attribute_t & getAttribute(const std::string & attribute_name) const;
}
/// Resolves an attribute by name through `attribute_index_by_name`.
/// Throws BAD_ARGUMENTS when the dictionary has no attribute with that name.
const attribute_t & getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);
    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS
    };
}
static StringRef placeKeysInPool( static StringRef placeKeysInPool(
const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys, Arena & pool) const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys, Arena & pool);
{
const auto keys_size = key_columns.size();
size_t sum_keys_size{};
for (const auto i : ext::range(0, keys_size))
{
keys[i] = key_columns[i]->getDataAtWithTerminatingZero(row);
sum_keys_size += keys[i].size;
}
const auto res = pool.alloc(sum_keys_size);
auto place = res;
for (size_t j = 0; j < keys_size; ++j)
{
memcpy(place, keys[j].data, keys[j].size);
place += keys[j].size;
}
return { res, sum_keys_size };
}
template <typename T> template <typename T>
void has(const attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray<UInt8> & out) const void has(const attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray<UInt8> & out) const;
{
const auto & attr = *std::get<ContainerPtrType<T>>(attribute.maps);
const auto keys_size = key_columns.size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
const auto rows = key_columns.front()->size();
for (const auto i : ext::range(0, rows))
{
/// copy key data to arena so it is contiguous and return StringRef to it
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
const auto it = attr.find(key);
out[i] = it != attr.end();
/// free memory allocated for the key
temporary_keys_pool.rollback(key.size);
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
const std::string name; const std::string name;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;

View File

@ -16,14 +16,6 @@
namespace DB namespace DB
{ {
/// Error codes thrown by the dictionary structure helpers in this header.
/// They are only declared here (`extern`); the definitions live in another translation unit.
/// BAD_ARGUMENTS is declared as well because the configuration parsing code in this header throws it.
namespace ErrorCodes
{
    extern const int UNKNOWN_TYPE;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int TYPE_MISMATCH;
    extern const int BAD_ARGUMENTS;
}
enum class AttributeUnderlyingType enum class AttributeUnderlyingType
{ {
UInt8, UInt8,
@ -39,56 +31,15 @@ enum class AttributeUnderlyingType
String String
}; };
inline AttributeUnderlyingType getAttributeUnderlyingType(const std::string & type)
{
static const std::unordered_map<std::string, AttributeUnderlyingType> dictionary{
{ "UInt8", AttributeUnderlyingType::UInt8 },
{ "UInt16", AttributeUnderlyingType::UInt16 },
{ "UInt32", AttributeUnderlyingType::UInt32 },
{ "UInt64", AttributeUnderlyingType::UInt64 },
{ "Int8", AttributeUnderlyingType::Int8 },
{ "Int16", AttributeUnderlyingType::Int16 },
{ "Int32", AttributeUnderlyingType::Int32 },
{ "Int64", AttributeUnderlyingType::Int64 },
{ "Float32", AttributeUnderlyingType::Float32 },
{ "Float64", AttributeUnderlyingType::Float64 },
{ "String", AttributeUnderlyingType::String },
{ "Date", AttributeUnderlyingType::UInt16 },
{ "DateTime", AttributeUnderlyingType::UInt32 },
};
const auto it = dictionary.find(type); /** Для неявных преобразований в функциях dictGet.
if (it != std::end(dictionary)) */
return it->second; bool isAttributeTypeConvertibleTo(AttributeUnderlyingType from, AttributeUnderlyingType to);
throw Exception{ AttributeUnderlyingType getAttributeUnderlyingType(const std::string & type);
"Unknown type " + type,
ErrorCodes::UNKNOWN_TYPE
};
}
inline std::string toString(const AttributeUnderlyingType type) std::string toString(const AttributeUnderlyingType type);
{
switch (type)
{
case AttributeUnderlyingType::UInt8: return "UInt8";
case AttributeUnderlyingType::UInt16: return "UInt16";
case AttributeUnderlyingType::UInt32: return "UInt32";
case AttributeUnderlyingType::UInt64: return "UInt64";
case AttributeUnderlyingType::Int8: return "Int8";
case AttributeUnderlyingType::Int16: return "Int16";
case AttributeUnderlyingType::Int32: return "Int32";
case AttributeUnderlyingType::Int64: return "Int64";
case AttributeUnderlyingType::Float32: return "Float32";
case AttributeUnderlyingType::Float64: return "Float64";
case AttributeUnderlyingType::String: return "String";
}
throw Exception{
"Unknown attribute_type " + toString(static_cast<int>(type)),
ErrorCodes::ARGUMENT_OUT_OF_BOUND
};
}
/// Min and max lifetimes for a dictionary or it's entry /// Min and max lifetimes for a dictionary or it's entry
struct DictionaryLifetime final struct DictionaryLifetime final
@ -96,16 +47,10 @@ struct DictionaryLifetime final
std::uint64_t min_sec; std::uint64_t min_sec;
std::uint64_t max_sec; std::uint64_t max_sec;
DictionaryLifetime(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) DictionaryLifetime(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
{
const auto & lifetime_min_key = config_prefix + ".min";
const auto has_min = config.has(lifetime_min_key);
this->min_sec = has_min ? config.getInt(lifetime_min_key) : config.getInt(config_prefix);
this->max_sec = has_min ? config.getInt(config_prefix + ".max") : this->min_sec;
}
}; };
/** Holds the description of a single dictionary attribute: /** Holds the description of a single dictionary attribute:
* - name, used for lookup into dictionary and source; * - name, used for lookup into dictionary and source;
* - type, used in conjunction with DataTypeFactory and getAttributeUnderlyingTypeByname; * - type, used in conjunction with DataTypeFactory and getAttributeUnderlyingTypeByname;
@ -125,23 +70,16 @@ struct DictionaryAttribute final
const bool injective; const bool injective;
}; };
struct DictionarySpecialAttribute final struct DictionarySpecialAttribute final
{ {
const std::string name; const std::string name;
const std::string expression; const std::string expression;
DictionarySpecialAttribute(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) DictionarySpecialAttribute(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
: name{config.getString(config_prefix + ".name", "")},
expression{config.getString(config_prefix + ".expression", "")}
{
if (name.empty() && !expression.empty())
throw Exception{
"Element " + config_prefix + ".name is empty",
ErrorCodes::BAD_ARGUMENTS
};
}
}; };
/// Name of identifier plus list of attributes /// Name of identifier plus list of attributes
struct DictionaryStructure final struct DictionaryStructure final
{ {
@ -152,188 +90,17 @@ struct DictionaryStructure final
std::experimental::optional<DictionarySpecialAttribute> range_max; std::experimental::optional<DictionarySpecialAttribute> range_max;
bool has_expressions = false; bool has_expressions = false;
DictionaryStructure(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) DictionaryStructure(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
{
const auto has_id = config.has(config_prefix + ".id");
const auto has_key = config.has(config_prefix + ".key");
if (has_key && has_id) void validateKeyTypes(const DataTypes & key_types) const;
throw Exception{"Only one of 'id' and 'key' should be specified", ErrorCodes::BAD_ARGUMENTS}; std::string getKeyDescription() const;
bool isKeySizeFixed() const;
if (has_id) std::size_t getKeySize() const;
id.emplace(config, config_prefix + ".id");
else if (has_key)
{
key.emplace(getAttributes(config, config_prefix + ".key", false, false));
if (key->empty())
throw Exception{"Empty 'key' supplied", ErrorCodes::BAD_ARGUMENTS};
}
else
throw Exception{"Dictionary structure should specify either 'id' or 'key'", ErrorCodes::BAD_ARGUMENTS};
if (id)
{
if (id->name.empty())
throw Exception{"'id' cannot be empty", ErrorCodes::BAD_ARGUMENTS};
if (config.has(config_prefix + ".range_min"))
range_min.emplace(config, config_prefix + ".range_min");
if (config.has(config_prefix + ".range_max"))
range_max.emplace(config, config_prefix + ".range_max");
if (!id->expression.empty() ||
(range_min && !range_min->expression.empty()) ||
(range_max && !range_max->expression.empty()))
has_expressions = true;
}
attributes = getAttributes(config, config_prefix);
if (attributes.empty())
throw Exception{"Dictionary has no attributes defined", ErrorCodes::BAD_ARGUMENTS};
}
void validateKeyTypes(const DataTypes & key_types) const
{
if (key_types.size() != key.value().size())
throw Exception{
"Key structure does not match, expected " + getKeyDescription(),
ErrorCodes::TYPE_MISMATCH
};
for (const auto i : ext::range(0, key_types.size()))
{
const auto & expected_type = (*key)[i].type->getName();
const auto & actual_type = key_types[i]->getName();
if (expected_type != actual_type)
throw Exception{
"Key type at position " + std::to_string(i) + " does not match, expected " + expected_type +
", found " + actual_type,
ErrorCodes::TYPE_MISMATCH
};
}
}
/// Human-readable description of the key: "UInt64" for an `id`-based structure, otherwise the
/// parenthesised, comma-separated list of the key component type names, e.g. "(UInt64, String)".
std::string getKeyDescription() const
{
    if (id)
        return "UInt64";

    std::string description{"("};
    const char * separator = "";

    for (const auto & key_i : *key)
    {
        description += separator;
        description += key_i.type->getName();
        separator = ", ";
    }

    description += ')';
    return description;
}
/// Returns true when the key has a fixed byte size: either there is no composite key at all,
/// or none of its components has the String underlying type.
bool isKeySizeFixed() const
{
    if (!key)
        return true;

    /// iterate by reference: a by-value loop variable would copy a whole DictionaryAttribute per component
    for (const auto & key_i : *key)
        if (key_i.underlying_type == AttributeUnderlyingType::String)
            return false;

    return true;
}
std::size_t getKeySize() const
{
return std::accumulate(std::begin(*key), std::end(*key), std::size_t{},
[] (const auto running_size, const auto & key_i) {return running_size + key_i.type->getSizeOfField(); });
}
private: private:
std::vector<DictionaryAttribute> getAttributes( std::vector<DictionaryAttribute> getAttributes(
const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix,
const bool hierarchy_allowed = true, const bool allow_null_values = true) const bool hierarchy_allowed = true, const bool allow_null_values = true);
{
Poco::Util::AbstractConfiguration::Keys keys;
config.keys(config_prefix, keys);
auto has_hierarchy = false;
std::vector<DictionaryAttribute> attributes;
for (const auto & key : keys)
{
if (0 != strncmp(key.data(), "attribute", strlen("attribute")))
continue;
const auto prefix = config_prefix + '.' + key + '.';
const auto name = config.getString(prefix + "name");
const auto type_string = config.getString(prefix + "type");
const auto type = DataTypeFactory::instance().get(type_string);
const auto underlying_type = getAttributeUnderlyingType(type_string);
const auto expression = config.getString(prefix + "expression", "");
if (!expression.empty())
has_expressions = true;
Field null_value;
if (allow_null_values)
{
const auto null_value_string = config.getString(prefix + "null_value");
try
{
ReadBufferFromString null_value_buffer{null_value_string};
ColumnPtr column_with_null_value = type->createColumn();
type->deserializeTextEscaped(*column_with_null_value, null_value_buffer);
null_value = (*column_with_null_value)[0];
}
catch (const std::exception & e)
{
throw Exception{
std::string{"Error parsing null_value: "} + e.what(),
ErrorCodes::BAD_ARGUMENTS
};
}
}
const auto hierarchical = config.getBool(prefix + "hierarchical", false);
const auto injective = config.getBool(prefix + "injective", false);
if (name.empty())
throw Exception{
"Properties 'name' and 'type' of an attribute cannot be empty",
ErrorCodes::BAD_ARGUMENTS
};
if (has_hierarchy && !hierarchy_allowed)
throw Exception{
"Hierarchy not allowed in '" + prefix,
ErrorCodes::BAD_ARGUMENTS
};
if (has_hierarchy && hierarchical)
throw Exception{
"Only one hierarchical attribute supported",
ErrorCodes::BAD_ARGUMENTS
};
has_hierarchy = has_hierarchy || hierarchical;
attributes.emplace_back(DictionaryAttribute{
name, underlying_type, type, expression, null_value, hierarchical, injective
});
}
return attributes;
}
}; };
} }

View File

@ -15,45 +15,13 @@
namespace DB namespace DB
{ {
/// Error codes referenced as ErrorCodes::... by the flat dictionary code.
/// They are only declared here (`extern`); the definitions live in another translation unit.
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
}
/// Size limits for the flat dictionary; both constants deduce to `int`.
/// NOTE(review): presumably the starting and the maximum length of the per-attribute arrays
/// (i.e. the largest id the dictionary accepts) - their use is not visible here, confirm.
const auto initial_array_size = 1024;
const auto max_array_size = 500000;
class FlatDictionary final : public IDictionary class FlatDictionary final : public IDictionary
{ {
public: public:
FlatDictionary(const std::string & name, const DictionaryStructure & dict_struct, FlatDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, bool require_nonempty) DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, bool require_nonempty);
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
createAttributes();
try FlatDictionary(const FlatDictionary & other);
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
creation_exception = std::current_exception();
}
creation_time = std::chrono::system_clock::now();
}
/// Copy constructor. Delegates to the main constructor with the other dictionary's name, structure,
/// lifetime and `require_nonempty` flag, but with a clone of its source (`source_ptr->clone()`)
/// instead of the source object itself.
FlatDictionary(const FlatDictionary & other)
: FlatDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{}
std::exception_ptr getCreationException() const override { return creation_exception; } std::exception_ptr getCreationException() const override { return creation_exception; }
@ -93,31 +61,10 @@ public:
bool hasHierarchy() const override { return hierarchical_attribute; } bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const override void toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const override;
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
getItems<UInt64>(*hierarchical_attribute, ids,
[&] (const std::size_t row, const UInt64 value) { out[row] = value; },
[&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const\ void get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItems<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -129,38 +76,13 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
const auto & null_value = StringRef{std::get<String>(attribute.null_values)}; void getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const;
getItems<StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -172,38 +94,15 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def,
ColumnString * const out) const ColumnString * const out) const;
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems<StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t row) { return def->getDataAt(row); });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE def,\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE def,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return def; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -215,41 +114,12 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def,
ColumnString * const out) const ColumnString * const out) const;;
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems<StringRef>(attribute, ids, void has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const override;
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return StringRef{def}; });
}
void has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const override
{
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, ids, out); break;
case AttributeUnderlyingType::Int8: has<Int8>(attribute, ids, out); break;
case AttributeUnderlyingType::Int16: has<Int16>(attribute, ids, out); break;
case AttributeUnderlyingType::Int32: has<Int32>(attribute, ids, out); break;
case AttributeUnderlyingType::Int64: has<Int64>(attribute, ids, out); break;
case AttributeUnderlyingType::Float32: has<Float32>(attribute, ids, out); break;
case AttributeUnderlyingType::Float64: has<Float64>(attribute, ids, out); break;
case AttributeUnderlyingType::String: has<String>(attribute, ids, out); break;
}
}
private: private:
template <typename Value> using ContainerType = PaddedPODArray<Value>; template <typename Value> using ContainerType = PaddedPODArray<Value>;
@ -271,223 +141,42 @@ private:
std::unique_ptr<Arena> string_arena; std::unique_ptr<Arena> string_arena;
}; };
void createAttributes() void createAttributes();
{ void loadData();
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
{
hierarchical_attribute = &attributes.back();
if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64)
throw Exception{
name + ": hierarchical attribute must be UInt64.",
ErrorCodes::TYPE_MISMATCH
};
}
}
}
void loadData()
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
while (const auto block = stream->read())
{
const auto & id_column = *block.getByPosition(0).column;
element_count += id_column.size();
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *block.getByPosition(attribute_idx + 1).column;
auto & attribute = attributes[attribute_idx];
for (const auto row_idx : ext::range(0, id_column.size()))
setAttributeValue(attribute, id_column[row_idx].get<UInt64>(), attribute_column[row_idx]);
}
}
stream->readSuffix();
if (require_nonempty && 0 == element_count)
throw Exception{
name + ": dictionary source is empty and 'require_nonempty' property is set.",
ErrorCodes::DICTIONARY_IS_EMPTY
};
}
template <typename T> template <typename T>
void addAttributeSize(const attribute_t & attribute) void addAttributeSize(const attribute_t & attribute);
{
const auto & array_ref = std::get<ContainerPtrType<T>>(attribute.arrays);
bytes_allocated += sizeof(PaddedPODArray<T>) + array_ref->allocated_size();
bucket_count = array_ref->capacity();
}
void calculateBytesAllocated() void calculateBytesAllocated();
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
case AttributeUnderlyingType::String:
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
}
template <typename T> template <typename T>
void createAttributeImpl(attribute_t & attribute, const Field & null_value) void createAttributeImpl(attribute_t & attribute, const Field & null_value);
{
const auto & null_value_ref = std::get<T>(attribute.null_values) =
null_value.get<typename NearestFieldType<T>::Type>();
std::get<ContainerPtrType<T>>(attribute.arrays) =
std::make_unique<ContainerType<T>>(initial_array_size, null_value_ref);
}
attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
{
attribute_t attr{type};
switch (type) template <typename OutputType, typename ValueSetter, typename DefaultGetter>
{ void getItemsNumber(
case AttributeUnderlyingType::UInt8: createAttributeImpl<UInt8>(attr, null_value); break; const attribute_t & attribute,
case AttributeUnderlyingType::UInt16: createAttributeImpl<UInt16>(attr, null_value); break; const PaddedPODArray<id_t> & ids,
case AttributeUnderlyingType::UInt32: createAttributeImpl<UInt32>(attr, null_value); break; ValueSetter && set_value,
case AttributeUnderlyingType::UInt64: createAttributeImpl<UInt64>(attr, null_value); break; DefaultGetter && get_default) const;
case AttributeUnderlyingType::Int8: createAttributeImpl<Int8>(attr, null_value); break;
case AttributeUnderlyingType::Int16: createAttributeImpl<Int16>(attr, null_value); break;
case AttributeUnderlyingType::Int32: createAttributeImpl<Int32>(attr, null_value); break;
case AttributeUnderlyingType::Int64: createAttributeImpl<Int64>(attr, null_value); break;
case AttributeUnderlyingType::Float32: createAttributeImpl<Float32>(attr, null_value); break;
case AttributeUnderlyingType::Float64: createAttributeImpl<Float64>(attr, null_value); break;
case AttributeUnderlyingType::String:
{
const auto & null_value_ref = std::get<String>(attr.null_values) = null_value.get<String>();
std::get<ContainerPtrType<StringRef>>(attr.arrays) =
std::make_unique<ContainerType<StringRef>>(initial_array_size, StringRef{null_value_ref});
attr.string_arena = std::make_unique<Arena>();
break;
}
}
return attr; template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
} void getItemsImpl(
const attribute_t & attribute,
template <typename T, typename ValueSetter, typename DefaultGetter> const PaddedPODArray<id_t> & ids,
void getItems( ValueSetter && set_value,
const attribute_t & attribute, const PaddedPODArray<id_t> & ids, ValueSetter && set_value, DefaultGetter && get_default) const;
DefaultGetter && get_default) const
{
const auto & attr = *std::get<ContainerPtrType<T>>(attribute.arrays);
const auto rows = ext::size(ids);
using null_value_type = std::conditional_t<std::is_same<T, StringRef>::value, String, T>;
const auto null_value = std::get<null_value_type>(attribute.null_values);
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
set_value(row, id < ext::size(attr) && attr[id] != null_value ? attr[id] : get_default(row));
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T> template <typename T>
void setAttributeValueImpl(attribute_t & attribute, const id_t id, const T value) void setAttributeValueImpl(attribute_t & attribute, const id_t id, const T value);
{
auto & array = *std::get<ContainerPtrType<T>>(attribute.arrays);
if (id >= array.size())
array.resize_fill(id + 1, std::get<T>(attribute.null_values));
array[id] = value;
}
void setAttributeValue(attribute_t & attribute, const id_t id, const Field & value) void setAttributeValue(attribute_t & attribute, const id_t id, const Field & value);
{
if (id >= max_array_size)
throw Exception{
name + ": identifier should be less than " + toString(max_array_size),
ErrorCodes::ARGUMENT_OUT_OF_BOUND
};
switch (attribute.type) const attribute_t & getAttribute(const std::string & attribute_name) const;
{
case AttributeUnderlyingType::UInt8: setAttributeValueImpl<UInt8>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt16: setAttributeValueImpl<UInt16>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt32: setAttributeValueImpl<UInt32>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt64: setAttributeValueImpl<UInt64>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::Int8: setAttributeValueImpl<Int8>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int16: setAttributeValueImpl<Int16>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int32: setAttributeValueImpl<Int32>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int64: setAttributeValueImpl<Int64>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Float32: setAttributeValueImpl<Float32>(attribute, id, value.get<Float64>()); break;
case AttributeUnderlyingType::Float64: setAttributeValueImpl<Float64>(attribute, id, value.get<Float64>()); break;
case AttributeUnderlyingType::String:
{
auto & array = *std::get<ContainerPtrType<StringRef>>(attribute.arrays);
if (id >= array.size())
array.resize_fill(id + 1, StringRef{std::get<String>(attribute.null_values)});
const auto & string = value.get<String>();
const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());
array[id] = StringRef{string_in_arena, string.size()};
break;
}
}
}
const attribute_t & getAttribute(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception{
name + ": no such attribute '" + attribute_name + "'",
ErrorCodes::BAD_ARGUMENTS
};
return attributes[it->second];
}
template <typename T> template <typename T>
void has(const attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const void has(const attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const;
{
using stored_type = std::conditional_t<std::is_same<T, String>::value, StringRef, T>;
const auto & attr = *std::get<ContainerPtrType<stored_type>>(attribute.arrays);
const auto & null_value = std::get<T>(attribute.null_values);
const auto rows = ext::size(ids);
for (const auto i : ext::range(0, rows))
{
const auto id = ids[i];
out[i] = id < ext::size(attr) && attr[id] != null_value;
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
const std::string name; const std::string name;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;

View File

@ -14,41 +14,13 @@
namespace DB namespace DB
{ {
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
}
class HashedDictionary final : public IDictionary class HashedDictionary final : public IDictionary
{ {
public: public:
HashedDictionary(const std::string & name, const DictionaryStructure & dict_struct, HashedDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, bool require_nonempty) DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, bool require_nonempty);
: name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
createAttributes();
try HashedDictionary(const HashedDictionary & other);
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
creation_exception = std::current_exception();
}
creation_time = std::chrono::system_clock::now();
}
HashedDictionary(const HashedDictionary & other)
: HashedDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{}
std::exception_ptr getCreationException() const override { return creation_exception; } std::exception_ptr getCreationException() const override { return creation_exception; }
@ -88,31 +60,10 @@ public:
bool hasHierarchy() const override { return hierarchical_attribute; } bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const override void toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const override;
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
getItems<UInt64>(*hierarchical_attribute, ids,
[&] (const std::size_t row, const UInt64 value) { out[row] = value; },
[&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const\ void get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItems<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -124,38 +75,13 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
const auto & null_value = StringRef{std::get<String>(attribute.null_values)}; void getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const;
getItems<StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -167,37 +93,14 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def,
ColumnString * const out) const ColumnString * const out) const;
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems<StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t row) { return def->getDataAt(row); });
}
#define DECLARE(TYPE)\ #define DECLARE(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE & def, PaddedPODArray<TYPE> & out) const\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE & def, PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttribute(attribute_name);\
if (attribute.type != AttributeUnderlyingType::TYPE)\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH\
};\
\
getItems<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return def; });\
}
DECLARE(UInt8) DECLARE(UInt8)
DECLARE(UInt16) DECLARE(UInt16)
DECLARE(UInt32) DECLARE(UInt32)
@ -209,41 +112,12 @@ public:
DECLARE(Float32) DECLARE(Float32)
DECLARE(Float64) DECLARE(Float64)
#undef DECLARE #undef DECLARE
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def,
ColumnString * const out) const ColumnString * const out) const;
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != AttributeUnderlyingType::String)
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH
};
getItems<StringRef>(attribute, ids, void has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const override;
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return StringRef{def}; });
}
void has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const override
{
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, ids, out); break;
case AttributeUnderlyingType::Int8: has<Int8>(attribute, ids, out); break;
case AttributeUnderlyingType::Int16: has<Int16>(attribute, ids, out); break;
case AttributeUnderlyingType::Int32: has<Int32>(attribute, ids, out); break;
case AttributeUnderlyingType::Int64: has<Int64>(attribute, ids, out); break;
case AttributeUnderlyingType::Float32: has<Float32>(attribute, ids, out); break;
case AttributeUnderlyingType::Float64: has<Float64>(attribute, ids, out); break;
case AttributeUnderlyingType::String: has<StringRef>(attribute, ids, out); break;
}
}
private: private:
template <typename Value> using CollectionType = HashMap<UInt64, Value>; template <typename Value> using CollectionType = HashMap<UInt64, Value>;
@ -265,203 +139,43 @@ private:
std::unique_ptr<Arena> string_arena; std::unique_ptr<Arena> string_arena;
}; };
void createAttributes() void createAttributes();
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes) void loadData();
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
{
hierarchical_attribute = &attributes.back();
if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64)
throw Exception{
name + ": hierarchical attribute must be UInt64.",
ErrorCodes::TYPE_MISMATCH
};
}
}
}
void loadData()
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
while (const auto block = stream->read())
{
const auto & id_column = *block.getByPosition(0).column;
element_count += id_column.size();
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *block.getByPosition(attribute_idx + 1).column;
auto & attribute = attributes[attribute_idx];
for (const auto row_idx : ext::range(0, id_column.size()))
setAttributeValue(attribute, id_column[row_idx].get<UInt64>(), attribute_column[row_idx]);
}
}
stream->readSuffix();
if (require_nonempty && 0 == element_count)
throw Exception{
name + ": dictionary source is empty and 'require_nonempty' property is set.",
ErrorCodes::DICTIONARY_IS_EMPTY
};
}
template <typename T> template <typename T>
void addAttributeSize(const attribute_t & attribute) void addAttributeSize(const attribute_t & attribute);
{
const auto & map_ref = std::get<CollectionPtrType<T>>(attribute.maps);
bytes_allocated += sizeof(CollectionType<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
void calculateBytesAllocated() void calculateBytesAllocated();
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
case AttributeUnderlyingType::String:
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
}
template <typename T> template <typename T>
void createAttributeImpl(attribute_t & attribute, const Field & null_value) void createAttributeImpl(attribute_t & attribute, const Field & null_value);
{
std::get<T>(attribute.null_values) = null_value.get<typename NearestFieldType<T>::Type>();
std::get<CollectionPtrType<T>>(attribute.maps) = std::make_unique<CollectionType<T>>();
}
attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
{
attribute_t attr{type};
switch (type) template <typename OutputType, typename ValueSetter, typename DefaultGetter>
{ void getItemsNumber(
case AttributeUnderlyingType::UInt8: createAttributeImpl<UInt8>(attr, null_value); break; const attribute_t & attribute,
case AttributeUnderlyingType::UInt16: createAttributeImpl<UInt16>(attr, null_value); break; const PaddedPODArray<id_t> & ids,
case AttributeUnderlyingType::UInt32: createAttributeImpl<UInt32>(attr, null_value); break; ValueSetter && set_value,
case AttributeUnderlyingType::UInt64: createAttributeImpl<UInt64>(attr, null_value); break; DefaultGetter && get_default) const;
case AttributeUnderlyingType::Int8: createAttributeImpl<Int8>(attr, null_value); break;
case AttributeUnderlyingType::Int16: createAttributeImpl<Int16>(attr, null_value); break;
case AttributeUnderlyingType::Int32: createAttributeImpl<Int32>(attr, null_value); break;
case AttributeUnderlyingType::Int64: createAttributeImpl<Int64>(attr, null_value); break;
case AttributeUnderlyingType::Float32: createAttributeImpl<Float32>(attr, null_value); break;
case AttributeUnderlyingType::Float64: createAttributeImpl<Float64>(attr, null_value); break;
case AttributeUnderlyingType::String:
{
std::get<String>(attr.null_values) = null_value.get<String>();
std::get<CollectionPtrType<StringRef>>(attr.maps) = std::make_unique<CollectionType<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
}
return attr; template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
} void getItemsImpl(
const attribute_t & attribute,
template <typename T, typename ValueSetter, typename DefaultGetter> const PaddedPODArray<id_t> & ids,
void getItems( ValueSetter && set_value,
const attribute_t & attribute, const PaddedPODArray<id_t> & ids, ValueSetter && set_value, DefaultGetter && get_default) const;
DefaultGetter && get_default) const
{
const auto & attr = *std::get<CollectionPtrType<T>>(attribute.maps);
const auto rows = ext::size(ids);
for (const auto i : ext::range(0, rows))
{
const auto it = attr.find(ids[i]);
set_value(i, it != attr.end() ? it->second : get_default(i));
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
template <typename T> template <typename T>
void setAttributeValueImpl(attribute_t & attribute, const id_t id, const T value) void setAttributeValueImpl(attribute_t & attribute, const id_t id, const T value);
{
auto & map = *std::get<CollectionPtrType<T>>(attribute.maps);
map.insert({ id, value });
}
void setAttributeValue(attribute_t & attribute, const id_t id, const Field & value) void setAttributeValue(attribute_t & attribute, const id_t id, const Field & value);
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: setAttributeValueImpl<UInt8>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt16: setAttributeValueImpl<UInt16>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt32: setAttributeValueImpl<UInt32>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt64: setAttributeValueImpl<UInt64>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::Int8: setAttributeValueImpl<Int8>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int16: setAttributeValueImpl<Int16>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int32: setAttributeValueImpl<Int32>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int64: setAttributeValueImpl<Int64>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Float32: setAttributeValueImpl<Float32>(attribute, id, value.get<Float64>()); break;
case AttributeUnderlyingType::Float64: setAttributeValueImpl<Float64>(attribute, id, value.get<Float64>()); break;
case AttributeUnderlyingType::String:
{
auto & map = *std::get<CollectionPtrType<StringRef>>(attribute.maps);
const auto & string = value.get<String>();
const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());
map.insert({ id, StringRef{string_in_arena, string.size()} });
break;
}
}
}
const attribute_t & getAttribute(const std::string & attribute_name) const const attribute_t & getAttribute(const std::string & attribute_name) const;
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception{
name + ": no such attribute '" + attribute_name + "'",
ErrorCodes::BAD_ARGUMENTS
};
return attributes[it->second];
}
template <typename T> template <typename T>
void has(const attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const void has(const attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const;
{
const auto & attr = *std::get<CollectionPtrType<T>>(attribute.maps);
const auto rows = ext::size(ids);
for (const auto i : ext::range(0, rows))
out[i] = attr.find(ids[i]) != std::end(attr);
query_count.fetch_add(rows, std::memory_order_relaxed);
}
const std::string name; const std::string name;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;

View File

@ -20,146 +20,32 @@ class MySQLDictionarySource final : public IDictionarySource
public: public:
MySQLDictionarySource(const DictionaryStructure & dict_struct_, MySQLDictionarySource(const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix,
const Block & sample_block) const Block & sample_block);
: dict_struct{dict_struct_},
db{config.getString(config_prefix + ".db", "")},
table{config.getString(config_prefix + ".table")},
where{config.getString(config_prefix + ".where", "")},
dont_check_update_time{config.getBool(config_prefix + ".dont_check_update_time", false)},
sample_block{sample_block},
pool{config, config_prefix},
query_builder{dict_struct, db, table, where},
load_all_query{query_builder.composeLoadAllQuery()}
{}
/// copy-constructor is provided in order to support cloneability /// copy-constructor is provided in order to support cloneability
MySQLDictionarySource(const MySQLDictionarySource & other) MySQLDictionarySource(const MySQLDictionarySource & other);
: dict_struct{other.dict_struct},
db{other.db},
table{other.table},
where{other.where},
dont_check_update_time{other.dont_check_update_time},
sample_block{other.sample_block},
pool{other.pool},
query_builder{dict_struct, db, table, where},
load_all_query{other.load_all_query}, last_modification{other.last_modification}
{}
BlockInputStreamPtr loadAll() override BlockInputStreamPtr loadAll() override;
{
last_modification = getLastModification();
LOG_TRACE(log, load_all_query); BlockInputStreamPtr loadIds(const std::vector<std::uint64_t> & ids) override;
return std::make_shared<MySQLBlockInputStream>(pool.Get(), load_all_query, sample_block, max_block_size);
}
BlockInputStreamPtr loadIds(const std::vector<std::uint64_t> & ids) override
{
/// Здесь не логгируем и не обновляем время модификации, так как запрос может быть большим, и часто задаваться.
const auto query = query_builder.composeLoadIdsQuery(ids);
return std::make_shared<MySQLBlockInputStream>(pool.Get(), query, sample_block, max_block_size);
}
BlockInputStreamPtr loadKeys( BlockInputStreamPtr loadKeys(
const ConstColumnPlainPtrs & key_columns, const std::vector<std::size_t> & requested_rows) override const ConstColumnPlainPtrs & key_columns, const std::vector<std::size_t> & requested_rows) override;
{
/// Здесь не логгируем и не обновляем время модификации, так как запрос может быть большим, и часто задаваться.
const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN); bool isModified() const override;
return std::make_shared<MySQLBlockInputStream>(pool.Get(), query, sample_block, max_block_size);
}
bool isModified() const override bool supportsSelectiveLoad() const override;
{
if (dont_check_update_time)
return true;
return getLastModification() > last_modification; DictionarySourcePtr clone() const override;
}
bool supportsSelectiveLoad() const override { return true; } std::string toString() const override;
DictionarySourcePtr clone() const override { return std::make_unique<MySQLDictionarySource>(*this); }
std::string toString() const override
{
return "MySQL: " + db + '.' + table + (where.empty() ? "" : ", where: " + where);
}
private: private:
Logger * log = &Logger::get("MySQLDictionarySource"); Logger * log = &Logger::get("MySQLDictionarySource");
static std::string quoteForLike(const std::string s) static std::string quoteForLike(const std::string s);
{
std::string tmp;
tmp.reserve(s.size());
for (auto c : s)
{
if (c == '%' || c == '_' || c == '\\')
tmp.push_back('\\');
tmp.push_back(c);
}
std::string res;
{
WriteBufferFromString out(res);
writeQuoted(tmp, out);
}
return res;
}
LocalDateTime getLastModification() const
{
    /// The default answer is "now": it is returned when update time checks are disabled and
    /// whenever the update time cannot be obtained (a failure here is deliberately not an error).
    LocalDateTime modification_time{std::time(nullptr)};

    if (dont_check_update_time)
        return modification_time;

    try
    {
        auto connection = pool.Get();
        auto query = connection->query("SHOW TABLE STATUS LIKE " + quoteForLike(table));

        LOG_TRACE(log, query.str());

        auto result = query.use();

        size_t row_count = 0;

        if (auto row = result.fetch())
        {
            row_count = 1;

            /// position of the update time value within a result row
            const auto update_time_column = 12;
            const auto & cell = row[update_time_column];

            if (!cell.isNull())
            {
                modification_time = cell.getDateTime();
                LOG_TRACE(log, "Got update time: " << modification_time);
            }

            /// drain the remaining rows to avoid "commands out of sync" error
            while (result.fetch())
                ++row_count;
        }

        if (row_count == 0)
        {
            LOG_ERROR(log, "Cannot find table in SHOW TABLE STATUS result.");
        }
        else if (row_count > 1)
        {
            LOG_ERROR(log, "Found more than one table in SHOW TABLE STATUS result.");
        }
    }
    catch (...)
    {
        tryLogCurrentException("MySQLDictionarySource");
    }

    return modification_time;
}
LocalDateTime getLastModification() const;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;
const std::string db; const std::string db;

View File

@ -21,63 +21,25 @@ class ODBCDictionarySource final : public IDictionarySource
public: public:
ODBCDictionarySource(const DictionaryStructure & dict_struct_, ODBCDictionarySource(const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix,
const Block & sample_block) const Block & sample_block);
: dict_struct{dict_struct_},
db{config.getString(config_prefix + ".db", "")},
table{config.getString(config_prefix + ".table")},
where{config.getString(config_prefix + ".where", "")},
sample_block{sample_block},
pool{std::make_shared<Poco::Data::SessionPool>(
config.getString(config_prefix + ".connector", "ODBC"),
config.getString(config_prefix + ".connection_string"))},
query_builder{dict_struct, db, table, where},
load_all_query{query_builder.composeLoadAllQuery()}
{}
/// copy-constructor is provided in order to support cloneability /// copy-constructor is provided in order to support cloneability
ODBCDictionarySource(const ODBCDictionarySource & other) ODBCDictionarySource(const ODBCDictionarySource & other);
: dict_struct{other.dict_struct},
db{other.db},
table{other.table},
where{other.where},
sample_block{other.sample_block},
pool{other.pool},
query_builder{dict_struct, db, table, where},
load_all_query{other.load_all_query}
{}
BlockInputStreamPtr loadAll() override BlockInputStreamPtr loadAll() override;
{
LOG_TRACE(log, load_all_query);
return std::make_shared<ODBCBlockInputStream>(pool->get(), load_all_query, sample_block, max_block_size);
}
BlockInputStreamPtr loadIds(const std::vector<std::uint64_t> & ids) override BlockInputStreamPtr loadIds(const std::vector<std::uint64_t> & ids) override;
{
const auto query = query_builder.composeLoadIdsQuery(ids);
return std::make_shared<ODBCBlockInputStream>(pool->get(), query, sample_block, max_block_size);
}
BlockInputStreamPtr loadKeys( BlockInputStreamPtr loadKeys(
const ConstColumnPlainPtrs & key_columns, const std::vector<std::size_t> & requested_rows) override const ConstColumnPlainPtrs & key_columns, const std::vector<std::size_t> & requested_rows) override;
{
const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN);
return std::make_shared<ODBCBlockInputStream>(pool->get(), query, sample_block, max_block_size);
}
bool isModified() const override bool isModified() const override;
{
return true;
}
bool supportsSelectiveLoad() const override { return true; } bool supportsSelectiveLoad() const override;
DictionarySourcePtr clone() const override { return std::make_unique<ODBCDictionarySource>(*this); } DictionarySourcePtr clone() const override;
std::string toString() const override std::string toString() const override;
{
return "ODBC: " + db + '.' + table + (where.empty() ? "" : ", where: " + where);
}
private: private:
Logger * log = &Logger::get("ODBCDictionarySource"); Logger * log = &Logger::get("ODBCDictionarySource");

View File

@ -19,29 +19,9 @@ class RangeHashedDictionary final : public IDictionaryBase
public: public:
RangeHashedDictionary( RangeHashedDictionary(
const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr, const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr,
const DictionaryLifetime dict_lifetime, bool require_nonempty) const DictionaryLifetime dict_lifetime, bool require_nonempty);
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
createAttributes();
try RangeHashedDictionary(const RangeHashedDictionary & other);
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
creation_exception = std::current_exception();
}
creation_time = std::chrono::system_clock::now();
}
/// Copy constructor: delegates to the main constructor with the other dictionary's name, structure,
/// lifetime and require_nonempty flag, and with a clone of its source (not the source itself).
RangeHashedDictionary(const RangeHashedDictionary & other)
: RangeHashedDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{}
std::exception_ptr getCreationException() const override { return creation_exception; } std::exception_ptr getCreationException() const override { return creation_exception; }
@ -82,11 +62,7 @@ public:
#define DECLARE_MULTIPLE_GETTER(TYPE)\ #define DECLARE_MULTIPLE_GETTER(TYPE)\
void get##TYPE(\ void get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates,\ const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates,\
PaddedPODArray<TYPE> & out) const\ PaddedPODArray<TYPE> & out) const;
{\
const auto & attribute = getAttributeWithType(attribute_name, AttributeUnderlyingType::TYPE);\
getItems<TYPE>(attribute, ids, dates, out);\
}
DECLARE_MULTIPLE_GETTER(UInt8) DECLARE_MULTIPLE_GETTER(UInt8)
DECLARE_MULTIPLE_GETTER(UInt16) DECLARE_MULTIPLE_GETTER(UInt16)
DECLARE_MULTIPLE_GETTER(UInt32) DECLARE_MULTIPLE_GETTER(UInt32)
@ -101,31 +77,7 @@ public:
void getString( void getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates, const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates,
ColumnString * out) const ColumnString * out) const;
{
const auto & attribute = getAttributeWithType(attribute_name, AttributeUnderlyingType::String);
const auto & attr = *std::get<ptr_t<StringRef>>(attribute.maps);
const auto & null_value = std::get<String>(attribute.null_values);
for (const auto i : ext::range(0, ids.size()))
{
const auto it = attr.find(ids[i]);
if (it != std::end(attr))
{
const auto date = dates[i];
const auto & ranges_and_values = it->second;
const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values),
[date] (const value_t<StringRef> & v) { return v.range.contains(date); });
const auto string_ref = val_it != std::end(ranges_and_values) ? val_it->value : StringRef{null_value};
out->insertData(string_ref.data, string_ref.size);
}
else
out->insertData(null_value.data(), null_value.size());
}
query_count.fetch_add(ids.size(), std::memory_order_relaxed);
}
private: private:
struct range_t : std::pair<UInt16, UInt16> struct range_t : std::pair<UInt16, UInt16>
@ -175,243 +127,44 @@ private:
std::unique_ptr<Arena> string_arena; std::unique_ptr<Arena> string_arena;
}; };
void createAttributes() void createAttributes();
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & attribute : dict_struct.attributes) void loadData();
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
throw Exception{
name + ": hierarchical attributes not supported by " + getName() + " dictionary.",
ErrorCodes::BAD_ARGUMENTS
};
}
}
void loadData()
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
while (const auto block = stream->read())
{
const auto & id_column = *block.getByPosition(0).column;
const auto & min_range_column = *block.getByPosition(1).column;
const auto & max_range_column = *block.getByPosition(2).column;
element_count += id_column.size();
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *block.getByPosition(attribute_idx + 3).column;
auto & attribute = attributes[attribute_idx];
for (const auto row_idx : ext::range(0, id_column.size()))
setAttributeValue(attribute, id_column[row_idx].get<UInt64>(),
range_t(min_range_column[row_idx].get<UInt64>(), max_range_column[row_idx].get<UInt64>()),
attribute_column[row_idx]);
}
}
stream->readSuffix();
if (require_nonempty && 0 == element_count)
throw Exception{
name + ": dictionary source is empty and 'require_nonempty' property is set.",
ErrorCodes::DICTIONARY_IS_EMPTY
};
}
template <typename T> template <typename T>
void addAttributeSize(const attribute_t & attribute) void addAttributeSize(const attribute_t & attribute);
{
const auto & map_ref = std::get<ptr_t<T>>(attribute.maps);
bytes_allocated += sizeof(collection_t<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
void calculateBytesAllocated() void calculateBytesAllocated();
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
case AttributeUnderlyingType::String:
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
}
template <typename T> template <typename T>
void createAttributeImpl(attribute_t & attribute, const Field & null_value) void createAttributeImpl(attribute_t & attribute, const Field & null_value);
{
std::get<T>(attribute.null_values) = null_value.get<typename NearestFieldType<T>::Type>();
std::get<ptr_t<T>>(attribute.maps) = std::make_unique<collection_t<T>>();
}
attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) attribute_t createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);
{
attribute_t attr{type};
switch (type)
{
case AttributeUnderlyingType::UInt8: createAttributeImpl<UInt8>(attr, null_value); break;
case AttributeUnderlyingType::UInt16: createAttributeImpl<UInt16>(attr, null_value); break;
case AttributeUnderlyingType::UInt32: createAttributeImpl<UInt32>(attr, null_value); break;
case AttributeUnderlyingType::UInt64: createAttributeImpl<UInt64>(attr, null_value); break;
case AttributeUnderlyingType::Int8: createAttributeImpl<Int8>(attr, null_value); break;
case AttributeUnderlyingType::Int16: createAttributeImpl<Int16>(attr, null_value); break;
case AttributeUnderlyingType::Int32: createAttributeImpl<Int32>(attr, null_value); break;
case AttributeUnderlyingType::Int64: createAttributeImpl<Int64>(attr, null_value); break;
case AttributeUnderlyingType::Float32: createAttributeImpl<Float32>(attr, null_value); break;
case AttributeUnderlyingType::Float64: createAttributeImpl<Float64>(attr, null_value); break;
case AttributeUnderlyingType::String:
{
std::get<String>(attr.null_values) = null_value.get<String>();
std::get<ptr_t<StringRef>>(attr.maps) = std::make_unique<collection_t<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
}
return attr; template <typename OutputType>
}
template <typename T>
void getItems( void getItems(
const attribute_t & attribute, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates, const attribute_t & attribute,
PaddedPODArray<T> & out) const const PaddedPODArray<id_t> & ids,
{ const PaddedPODArray<UInt16> & dates,
const auto & attr = *std::get<ptr_t<T>>(attribute.maps); PaddedPODArray<OutputType> & out) const;
const auto null_value = std::get<T>(attribute.null_values);
for (const auto i : ext::range(0, ids.size())) template <typename AttributeType, typename OutputType>
{ void getItemsImpl(
const auto it = attr.find(ids[i]); const attribute_t & attribute,
if (it != std::end(attr)) const PaddedPODArray<id_t> & ids,
{ const PaddedPODArray<UInt16> & dates,
const auto date = dates[i]; PaddedPODArray<OutputType> & out) const;
const auto & ranges_and_values = it->second;
const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values),
[date] (const value_t<T> & v) { return v.range.contains(date); });
out[i] = val_it != std::end(ranges_and_values) ? val_it->value : null_value;
}
else
out[i] = null_value;
}
query_count.fetch_add(ids.size(), std::memory_order_relaxed);
}
template <typename T> template <typename T>
void setAttributeValueImpl(attribute_t & attribute, const id_t id, const range_t & range, const T value) void setAttributeValueImpl(attribute_t & attribute, const id_t id, const range_t & range, const T value);
{
auto & map = *std::get<ptr_t<T>>(attribute.maps);
const auto it = map.find(id);
if (it != map.end()) void setAttributeValue(attribute_t & attribute, const id_t id, const range_t & range, const Field & value);
{
auto & values = it->second;
const auto insert_it = std::lower_bound(std::begin(values), std::end(values), range, const attribute_t & getAttribute(const std::string & attribute_name) const;
[] (const value_t<T> & lhs, const range_t & range) {
return lhs.range < range;
});
values.insert(insert_it, value_t<T>{ range, value }); const attribute_t & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const;
}
else
map.insert({ id, values_t<T>{ value_t<T>{ range, value } } });
}
void setAttributeValue(attribute_t & attribute, const id_t id, const range_t & range, const Field & value)
{
    /// Stores `value` for (`id`, `range`) in the map matching the attribute's underlying type.
    /// Numeric values are extracted from the Field as UInt64 / Int64 / Float64 and converted to the attribute type.
    switch (attribute.type)
    {
        case AttributeUnderlyingType::UInt8:
            setAttributeValueImpl<UInt8>(attribute, id, range, value.get<UInt64>());
            break;
        case AttributeUnderlyingType::UInt16:
            setAttributeValueImpl<UInt16>(attribute, id, range, value.get<UInt64>());
            break;
        case AttributeUnderlyingType::UInt32:
            setAttributeValueImpl<UInt32>(attribute, id, range, value.get<UInt64>());
            break;
        case AttributeUnderlyingType::UInt64:
            setAttributeValueImpl<UInt64>(attribute, id, range, value.get<UInt64>());
            break;
        case AttributeUnderlyingType::Int8:
            setAttributeValueImpl<Int8>(attribute, id, range, value.get<Int64>());
            break;
        case AttributeUnderlyingType::Int16:
            setAttributeValueImpl<Int16>(attribute, id, range, value.get<Int64>());
            break;
        case AttributeUnderlyingType::Int32:
            setAttributeValueImpl<Int32>(attribute, id, range, value.get<Int64>());
            break;
        case AttributeUnderlyingType::Int64:
            setAttributeValueImpl<Int64>(attribute, id, range, value.get<Int64>());
            break;
        case AttributeUnderlyingType::Float32:
            setAttributeValueImpl<Float32>(attribute, id, range, value.get<Float64>());
            break;
        case AttributeUnderlyingType::Float64:
            setAttributeValueImpl<Float64>(attribute, id, range, value.get<Float64>());
            break;
        case AttributeUnderlyingType::String:
        {
            /// The string bytes are copied into the attribute's arena; the map keeps only a reference to that copy.
            const auto & string = value.get<String>();
            const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());

            setAttributeValueImpl<StringRef>(attribute, id, range, StringRef{string_in_arena, string.size()});
            break;
        }
    }
}
const attribute_t & getAttribute(const std::string & attribute_name) const
{
    /// Finds an attribute by its name; an unknown name is reported as BAD_ARGUMENTS.
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS
    };
}
const attribute_t & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const
{
    /// Returns the attribute called `name` after checking that its underlying type is exactly `type`.
    /// Throws TYPE_MISMATCH on a different type; an unknown attribute name is reported by getAttribute().
    const auto & attribute = getAttribute(name);

    /// The parameter `name` hides the member holding the dictionary name,
    /// so the message prefix has to be taken from this->name explicitly.
    if (attribute.type != type)
        throw Exception{
            this->name + ": type mismatch: attribute " + name + " has type " + toString(attribute.type),
            ErrorCodes::TYPE_MISMATCH
        };

    return attribute;
}
const std::string name; const std::string name;
const DictionaryStructure dict_struct; const DictionaryStructure dict_struct;

View File

@ -33,6 +33,8 @@ namespace DB
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int DICTIONARIES_WAS_NOT_LOADED; extern const int DICTIONARIES_WAS_NOT_LOADED;
extern const int UNSUPPORTED_METHOD;
extern const int UNKNOWN_TYPE;
} }
/** Функции, использующие словари Яндекс.Метрики /** Функции, использующие словари Яндекс.Метрики
@ -2285,9 +2287,9 @@ private:
auto dict = dictionaries.getDictionary(dict_name_col->getData()); auto dict = dictionaries.getDictionary(dict_name_col->getData());
const auto dict_ptr = dict.get(); const auto dict_ptr = dict.get();
if (!executeDispatch<FlatDictionary>(block, arguments, result, dict_ptr) && if (!executeDispatch<FlatDictionary>(block, arguments, result, dict_ptr)
!executeDispatch<HashedDictionary>(block, arguments, result, dict_ptr) && && !executeDispatch<HashedDictionary>(block, arguments, result, dict_ptr)
!executeDispatch<CacheDictionary>(block, arguments, result, dict_ptr)) && !executeDispatch<CacheDictionary>(block, arguments, result, dict_ptr))
throw Exception{ throw Exception{
"Unsupported dictionary type " + dict_ptr->getTypeName(), "Unsupported dictionary type " + dict_ptr->getTypeName(),
ErrorCodes::UNKNOWN_TYPE}; ErrorCodes::UNKNOWN_TYPE};

View File

@ -47,6 +47,9 @@ struct MergeTreeSettings
/// Через сколько секунд удалять ненужные куски. /// Через сколько секунд удалять ненужные куски.
time_t old_parts_lifetime = 8 * 60; time_t old_parts_lifetime = 8 * 60;
/// Через сколько секунд удалять tmp_-директории.
time_t temporary_directories_lifetime = 86400;
/** Настройки вставок. */ /** Настройки вставок. */
/// Если в таблице хотя бы столько активных кусков, искусственно замедлять вставки в таблицу. /// Если в таблице хотя бы столько активных кусков, искусственно замедлять вставки в таблицу.
@ -69,7 +72,9 @@ struct MergeTreeSettings
size_t max_suspicious_broken_parts = 10; size_t max_suspicious_broken_parts = 10;
/// Не выполнять ALTER, если количество файлов для модификации (удаления, добавления) больше указанного. /// Не выполнять ALTER, если количество файлов для модификации (удаления, добавления) больше указанного.
size_t max_files_to_modify_in_alter_columns = 5; size_t max_files_to_modify_in_alter_columns = 50;
/// Не выполнять ALTER, если количество файлов для удаления больше указанного.
size_t max_files_to_remove_in_alter_columns = 10;
/// Максимальное количество ошибок при загрузке кусков, при котором ReplicatedMergeTree соглашается запускаться. /// Максимальное количество ошибок при загрузке кусков, при котором ReplicatedMergeTree соглашается запускаться.
size_t replicated_max_unexpected_parts = 3; size_t replicated_max_unexpected_parts = 3;
@ -113,12 +118,14 @@ struct MergeTreeSettings
SET_SIZE_T(merge_parts_at_night_inc); SET_SIZE_T(merge_parts_at_night_inc);
SET_SIZE_T(max_replicated_merges_in_queue); SET_SIZE_T(max_replicated_merges_in_queue);
SET_SIZE_T(old_parts_lifetime); SET_SIZE_T(old_parts_lifetime);
SET_SIZE_T(temporary_directories_lifetime);
SET_SIZE_T(parts_to_delay_insert); SET_SIZE_T(parts_to_delay_insert);
SET_DOUBLE(insert_delay_step); SET_DOUBLE(insert_delay_step);
SET_SIZE_T(replicated_deduplication_window); SET_SIZE_T(replicated_deduplication_window);
SET_SIZE_T(replicated_logs_to_keep); SET_SIZE_T(replicated_logs_to_keep);
SET_SIZE_T(max_suspicious_broken_parts); SET_SIZE_T(max_suspicious_broken_parts);
SET_SIZE_T(max_files_to_modify_in_alter_columns); SET_SIZE_T(max_files_to_modify_in_alter_columns);
SET_SIZE_T(max_files_to_remove_in_alter_columns);
SET_SIZE_T(replicated_max_unexpected_parts); SET_SIZE_T(replicated_max_unexpected_parts);
SET_SIZE_T(replicated_max_unexpectedly_merged_parts); SET_SIZE_T(replicated_max_unexpectedly_merged_parts);
SET_SIZE_T(replicated_max_missing_obsolete_parts); SET_SIZE_T(replicated_max_missing_obsolete_parts);

View File

@ -61,6 +61,10 @@ protected:
const NamesAndTypesList & materialized_columns_, const NamesAndTypesList & materialized_columns_,
const NamesAndTypesList & alias_columns_, const NamesAndTypesList & alias_columns_,
const ColumnDefaults & column_defaults_); const ColumnDefaults & column_defaults_);
private:
/// Достать из самого внутреннего подзапроса имя базы данных и таблицы: select_database_name, select_table_name.
void extractDependentTable(const ASTSelectQuery & query);
}; };
} }

View File

@ -91,7 +91,7 @@ void MergingSortedBlockInputStream::init(Block & merged_block, ColumnPlainPtrs &
} }
for (size_t i = 0; i < num_columns; ++i) for (size_t i = 0; i < num_columns; ++i)
merged_columns.push_back(&*merged_block.getByPosition(i).column); merged_columns.emplace_back(merged_block.getByPosition(i).column.get());
} }

View File

@ -0,0 +1,712 @@
#include <DB/Columns/ColumnsNumber.h>
#include <DB/Dictionaries/CacheDictionary.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int BAD_ARGUMENTS;
extern const int UNSUPPORTED_METHOD;
}
/// Returns the smallest power of two that is not less than `n`.
/// Zero is rounded up to 1: returning 0 would yield an empty cache and an all-ones index mask (size - 1).
/// If the result does not fit into std::size_t, it wraps around to 0.
static inline std::size_t round_up_to_power_of_two(std::size_t n)
{
    if (n <= 1)
        return 1;

    --n;

    /// Smear the highest set bit into all lower positions. The shift is bounded by the
    /// actual width of std::size_t, so no shift by >= width happens on 32-bit targets.
    for (std::size_t shift = 1; shift < sizeof(n) * 8; shift <<= 1)
        n |= n >> shift;

    ++n;
    return n;
}
/// Produces a seed by XOR-ing the nanosecond part of the monotonic clock with the process id.
static inline std::uint64_t getSeed()
{
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const auto nanoseconds = now.tv_nsec;
    const auto process_id = getpid();

    return nanoseconds ^ process_id;
}
/// Maps an id to the index of its cell: the id's hash masked with (size - 1).
inline std::uint64_t CacheDictionary::getCellIdx(const id_t id) const
{
    const auto mask = size - 1;
    return intHash64(id) & mask;
}
/// Creates a cache dictionary. The requested `size` is rounded up to a power of two and
/// `cells` is constructed with that rounded size; the random engine is seeded with getSeed().
/// Throws UNSUPPORTED_METHOD if the source does not support selective load.
CacheDictionary::CacheDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime,
const std::size_t size)
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
size{round_up_to_power_of_two(size)},
cells{this->size},
rnd_engine{getSeed()}
{
/// the parameter source_ptr has been moved from above, so the member is accessed via this->
if (!this->source_ptr->supportsSelectiveLoad())
throw Exception{
name + ": source cannot be used with CacheDictionary",
ErrorCodes::UNSUPPORTED_METHOD};
createAttributes();
}
/// Copy constructor: delegates to the main constructor with the other dictionary's name, structure,
/// lifetime and size, and with a clone of its source.
CacheDictionary::CacheDictionary(const CacheDictionary & other)
: CacheDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.size}
{}
/// Fills `out` with the values of the hierarchical attribute for `ids`;
/// the attribute's null value serves as the default.
void CacheDictionary::toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const
{
    const auto parent_null_value = std::get<UInt64>(hierarchical_attribute->null_values);

    getItemsNumber<UInt64>(
        *hierarchical_attribute, ids, out,
        [&] (const std::size_t) { return parent_null_value; });
}
/// Defines CacheDictionary::get<TYPE>(attribute_name, ids, out) for every numeric type.
/// Each getter throws TYPE_MISMATCH unless the attribute's type is convertible to TYPE,
/// then fetches the values via getItemsNumber with the attribute's null value as the default.
#define DECLARE(TYPE)\
void CacheDictionary::get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const\
{\
auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItemsNumber<TYPE>(attribute, ids, out, [&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
/// Fetches a string attribute for `ids` into `out`, using the attribute's null value as the default.
/// Throws TYPE_MISMATCH unless the attribute's type is convertible to String.
void CacheDictionary::getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const
{
    auto & attribute = getAttribute(attribute_name);

    const auto is_string_compatible = isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String);
    if (!is_string_compatible)
        throw Exception{
            name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
            ErrorCodes::TYPE_MISMATCH};

    const auto attribute_null_value = StringRef{std::get<String>(attribute.null_values)};

    getItemsString(attribute, ids, out, [&] (const std::size_t) { return attribute_null_value; });
}
/// Defines CacheDictionary::get<TYPE>(attribute_name, ids, def, out) for every numeric type,
/// where `def` supplies a per-row default: the default for row `row` is def[row].
/// Each getter throws TYPE_MISMATCH unless the attribute's type is convertible to TYPE.
#define DECLARE(TYPE)\
void CacheDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\
PaddedPODArray<TYPE> & out) const\
{\
auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, ids, out, [&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
/// Fetches a string attribute for `ids` into `out`; the default for row `row` is taken from `def` at the same row.
/// Throws TYPE_MISMATCH unless the attribute's type is convertible to String.
void CacheDictionary::getString(
    const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def,
    ColumnString * const out) const
{
    auto & attribute = getAttribute(attribute_name);

    const auto is_string_compatible = isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String);
    if (!is_string_compatible)
        throw Exception{
            name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
            ErrorCodes::TYPE_MISMATCH};

    getItemsString(
        attribute, ids, out,
        [&] (const std::size_t row) { return def->getDataAt(row); });
}
/// Defines CacheDictionary::get<TYPE>(attribute_name, ids, def, out) for every numeric type,
/// where the single value `def` is the default for every row.
/// Each getter throws TYPE_MISMATCH unless the attribute's type is convertible to TYPE.
#define DECLARE(TYPE)\
void CacheDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE def, PaddedPODArray<TYPE> & out) const\
{\
auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, ids, out, [&] (const std::size_t) { return def; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
/// Fetches a string attribute for `ids` into `out`; the single string `def` is the default for every row.
/// Throws TYPE_MISMATCH unless the attribute's type is convertible to String.
void CacheDictionary::getString(
    const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def,
    ColumnString * const out) const
{
    auto & attribute = getAttribute(attribute_name);

    const auto is_string_compatible = isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String);
    if (!is_string_compatible)
        throw Exception{
            name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
            ErrorCodes::TYPE_MISMATCH};

    getItemsString(
        attribute, ids, out,
        [&] (const std::size_t) { return StringRef{def}; });
}
void CacheDictionary::has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const
{
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
MapType<std::vector<std::size_t>> outdated_ids;
const auto rows = ext::size(ids);
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
/** cell should be updated if either:
* 1. ids do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.id != id || cell.expiresAt() < now)
outdated_ids[id].push_back(row);
else
out[row] = !cell.isDefault();
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release);
if (outdated_ids.empty())
return;
std::vector<id_t> required_ids(outdated_ids.size());
std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids),
[] (auto & pair) { return pair.first; });
/// request new values
update(required_ids, [&] (const auto id, const auto) {
for (const auto row : outdated_ids[id])
out[row] = true;
}, [&] (const auto id, const auto) {
for (const auto row : outdated_ids[id])
out[row] = false;
});
}
void CacheDictionary::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
bytes_allocated += size * sizeof(cell_metadata_t);
bytes_allocated += size * sizeof(attributes.front());
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
{
hierarchical_attribute = &attributes.back();
if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64)
throw Exception{
name + ": hierarchical attribute must be UInt64.",
ErrorCodes::TYPE_MISMATCH};
}
}
}
/// Builds an attribute of the given underlying type: stores its null value, allocates a container
/// of `size` elements for the values and adds the container's memory to bytes_allocated.
CacheDictionary::attribute_t CacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t attr{type};

    /// All numeric types are handled identically; FIELD_TYPE is the type the null value is read as from the Field.
#define CREATE_NUMERIC_ATTRIBUTE(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            std::get<TYPE>(attr.null_values) = null_value.get<FIELD_TYPE>(); \
            std::get<ContainerPtrType<TYPE>>(attr.arrays) = std::make_unique<ContainerType<TYPE>>(size); \
            bytes_allocated += size * sizeof(TYPE); \
            break;

    switch (type)
    {
        CREATE_NUMERIC_ATTRIBUTE(UInt8, UInt64)
        CREATE_NUMERIC_ATTRIBUTE(UInt16, UInt64)
        CREATE_NUMERIC_ATTRIBUTE(UInt32, UInt64)
        CREATE_NUMERIC_ATTRIBUTE(UInt64, UInt64)
        CREATE_NUMERIC_ATTRIBUTE(Int8, Int64)
        CREATE_NUMERIC_ATTRIBUTE(Int16, Int64)
        CREATE_NUMERIC_ATTRIBUTE(Int32, Int64)
        CREATE_NUMERIC_ATTRIBUTE(Int64, Int64)
        CREATE_NUMERIC_ATTRIBUTE(Float32, Float64)
        CREATE_NUMERIC_ATTRIBUTE(Float64, Float64)

        case AttributeUnderlyingType::String:
            /// strings are kept as StringRef values; the arena is created once, on the first String attribute
            std::get<String>(attr.null_values) = null_value.get<String>();
            std::get<ContainerPtrType<StringRef>>(attr.arrays) = std::make_unique<ContainerType<StringRef>>(size);
            bytes_allocated += size * sizeof(StringRef);
            if (!string_arena)
                string_arena = std::make_unique<ArenaWithFreeLists>();
            break;
    }

#undef CREATE_NUMERIC_ATTRIBUTE

    return attr;
}
/** Dispatches a numeric lookup on the type the attribute is stored as.
  * The matching getItemsNumberImpl<StoredType, OutputType> does the actual work;
  * an attribute whose type is not one of the ten numeric types is a logical error.
  */
template <typename OutputType, typename DefaultGetter>
void CacheDictionary::getItemsNumber(
    attribute_t & attribute,
    const PaddedPODArray<id_t> & ids,
    PaddedPODArray<OutputType> & out,
    DefaultGetter && get_default) const
{
    switch (attribute.type)
    {
#define DISPATCH(TYPE) \
        case AttributeUnderlyingType::TYPE: \
            getItemsNumberImpl<TYPE, OutputType>(attribute, ids, out, std::forward<DefaultGetter>(get_default)); \
            break;

        DISPATCH(UInt8)
        DISPATCH(UInt16)
        DISPATCH(UInt32)
        DISPATCH(UInt64)
        DISPATCH(Int8)
        DISPATCH(Int16)
        DISPATCH(Int32)
        DISPATCH(Int64)
        DISPATCH(Float32)
        DISPATCH(Float64)
#undef DISPATCH

        default:
            throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
    }
}
/** Writes the value of `attribute` for `ids[row]` into `out[row]`, for every row.
  *
  * Rows whose cell holds the requested id and has not expired are answered from the cache
  * under a read lock. The remaining ids are grouped and handed to update(), whose callbacks
  * fill in the rows that were left over.
  *
  * `get_default(row)` supplies the value for rows whose id is cached as default, and for rows
  * whose id update() reports as not found.
  */
template <typename AttributeType, typename OutputType, typename DefaultGetter>
void CacheDictionary::getItemsNumberImpl(
    attribute_t & attribute,
    const PaddedPODArray<id_t> & ids,
    PaddedPODArray<OutputType> & out,
    DefaultGetter && get_default) const
{
    /// id -> every row index of `ids` that asks for this id and could not be served from the cache
    MapType<std::vector<std::size_t>> rows_by_stale_id;

    auto & values = std::get<ContainerPtrType<AttributeType>>(attribute.arrays);
    const auto rows = ext::size(ids);

    {
        const Poco::ScopedReadRWLock read_lock{rw_lock};
        const auto now = std::chrono::system_clock::now();

        for (const auto row : ext::range(0, rows))
        {
            const auto id = ids[row];
            const auto cell_idx = getCellIdx(id);
            const auto & cell = cells[cell_idx];

            /// a cell can answer only if it holds this very id and has not expired yet
            const auto usable = cell.id == id && !(cell.expiresAt() < now);
            if (usable)
                out[row] = cell.isDefault() ? get_default(row) : values[cell_idx];
            else
                rows_by_stale_id[id].push_back(row);
        }
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
    hit_count.fetch_add(rows - rows_by_stale_id.size(), std::memory_order_release);

    if (rows_by_stale_id.empty())
        return;

    /// every stale id is requested from the source exactly once
    std::vector<id_t> required_ids;
    required_ids.reserve(rows_by_stale_id.size());
    for (const auto & entry : rows_by_stale_id)
        required_ids.push_back(entry.first);

    update(required_ids,
        [&] (const auto id, const auto cell_idx)
        {
            const auto fetched = values[cell_idx];
            for (const auto row : rows_by_stale_id[id])
                out[row] = fetched;
        },
        [&] (const auto id, const auto)
        {
            for (const auto row : rows_by_stale_id[id])
                out[row] = get_default(row);
        });
}
/** Appends to `out` the string value of `attribute` for every id in `ids`, in row order.
  *
  * An optimistic pass first appends strings straight from the cache under a read lock; it is
  * abandoned at the first row whose cell holds a different id or has expired. In that case
  * whatever was appended is dropped and a pessimistic pass runs: usable cached strings are
  * copied aside, the remaining ids are requested through update(), and `out` is then filled
  * from the copies.
  *
  * `get_default(row)` provides the string for rows whose id is cached as default and for
  * rows whose id update() reports as not found.
  */
template <typename DefaultGetter>
void CacheDictionary::getItemsString(
attribute_t & attribute,
const PaddedPODArray<id_t> & ids,
ColumnString * out,
DefaultGetter && get_default) const
{
const auto rows = ext::size(ids);
/// save on some allocations
out->getOffsets().reserve(rows);
auto & attribute_array = std::get<ContainerPtrType<StringRef>>(attribute.arrays);
auto found_outdated_values = false;
/// perform optimistic version, fallback to pessimistic if failed
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, discard on fail
for (const auto row : ext::range(0, rows))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
/// the cell is unusable if it holds another id or has expired
if (cell.id != id || cell.expiresAt() < now)
{
found_outdated_values = true;
break;
}
else
{
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
out->insertData(string_ref.data, string_ref.size);
}
}
}
/// optimistic code completed successfully
if (!found_outdated_values)
{
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows, std::memory_order_release);
return;
}
/// now onto the pessimistic one, discard possible partial results from the optimistic path
/// NOTE(review): this truncates `out` to empty, so `out` is presumably expected to be empty on entry - confirm
out->getChars().resize_assume_reserved(0);
out->getOffsets().resize_assume_reserved(0);
/// Mapping: <id> -> { all indices `i` of `ids` such that `ids[i]` = <id> }
MapType<std::vector<std::size_t>> outdated_ids;
/// we are going to store every string separately
MapType<String> map;
/// running total of bytes (each string plus one extra byte) used to reserve `out` once before the final fill
std::size_t total_length = 0;
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
for (const auto row : ext::range(0, ids.size()))
{
const auto id = ids[row];
const auto cell_idx = getCellIdx(id);
const auto & cell = cells[cell_idx];
if (cell.id != id || cell.expiresAt() < now)
outdated_ids[id].push_back(row);
else
{
const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
/// only real values are copied aside; rows cached as default are resolved through get_default in the final loop
if (!cell.isDefault())
map[id] = String{string_ref};
total_length += string_ref.size + 1;
}
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
/// NOTE(review): outdated_ids is keyed by id, so an outdated id that occurs in several rows is subtracted only once
hit_count.fetch_add(rows - outdated_ids.size(), std::memory_order_release);
/// request new values
if (!outdated_ids.empty())
{
std::vector<id_t> required_ids(outdated_ids.size());
std::transform(std::begin(outdated_ids), std::end(outdated_ids), std::begin(required_ids),
[] (auto & pair) { return pair.first; });
update(required_ids, [&] (const auto id, const auto cell_idx) {
const auto attribute_value = attribute_array[cell_idx];
map[id] = String{attribute_value};
total_length += (attribute_value.size + 1) * outdated_ids[id].size();
}, [&] (const auto id, const auto cell_idx) {
for (const auto row : outdated_ids[id])
total_length += get_default(row).size + 1;
});
}
out->getChars().reserve(total_length);
/// fill `out` in row order; ids without an entry in `map` fall back to the caller's default
for (const auto row : ext::range(0, ext::size(ids)))
{
const auto id = ids[row];
const auto it = map.find(id);
const auto string_ref = it != std::end(map) ? StringRef{it->second} : get_default(row);
out->insertData(string_ref.data, string_ref.size);
}
}
/** Loads `requested_ids` from the source and stores the result in the cache cells.
  *
  * The write lock is taken before the source is queried and is held until the function returns.
  * For every row returned by the source, the attribute values are written into the cell chosen by
  * getCellIdx(id), the cell's id and expiration time are set, and `on_cell_updated(id, cell_idx)`
  * is called.
  * Every requested id that the source did not return gets the attributes' null values, is flagged
  * as default, and is reported through `on_id_not_found(id, cell_idx)`.
  *
  * Throws TYPE_MISMATCH if the first column of a block read from the source is not a UInt64 column.
  */
template <typename PresentIdHandler, typename AbsentIdHandler>
void CacheDictionary::update(
const std::vector<id_t> & requested_ids, PresentIdHandler && on_cell_updated,
AbsentIdHandler && on_id_not_found) const
{
/// id -> flag telling whether the source has returned this id (0 = not yet seen)
MapType<UInt8> remaining_ids{requested_ids.size()};
for (const auto id : requested_ids)
remaining_ids.insert({ id, 0 });
/// lifetime of a refreshed cell, in seconds, is drawn from [min_sec, max_sec]
std::uniform_int_distribution<std::uint64_t> distribution{
dict_lifetime.min_sec,
dict_lifetime.max_sec
};
const Poco::ScopedWriteRWLock write_lock{rw_lock};
auto stream = source_ptr->loadIds(requested_ids);
stream->readPrefix();
while (const auto block = stream->read())
{
/// the id is taken from the first column of the block
const auto id_column = typeid_cast<const ColumnUInt64 *>(block.getByPosition(0).column.get());
if (!id_column)
throw Exception{
name + ": id column has type different from UInt64.",
ErrorCodes::TYPE_MISMATCH};
const auto & ids = id_column->getData();
/// cache column pointers
/// (attribute columns follow the id column, hence the `i + 1`)
const auto column_ptrs = ext::map<std::vector>(ext::range(0, attributes.size()), [&block] (const auto & i) {
return block.getByPosition(i + 1).column.get();
});
for (const auto i : ext::range(0, ids.size()))
{
const auto id = ids[i];
const auto cell_idx = getCellIdx(id);
auto & cell = cells[cell_idx];
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *column_ptrs[attribute_idx];
auto & attribute = attributes[attribute_idx];
setAttributeValue(attribute, cell_idx, attribute_column[i]);
}
/// if cell id is zero and zero does not map to this cell, then the cell is unused
if (cell.id == 0 && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
cell.id = id;
/// NOTE(review): a random lifetime is applied only when both min_sec and max_sec are non-zero;
/// in every other case the cell is given the maximum time point as its expiration
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// inform caller
on_cell_updated(id, cell_idx);
/// mark corresponding id as found
remaining_ids[id] = 1;
}
}
stream->readSuffix();
/// Check which ids have not been found and require setting null_value
for (const auto id_found_pair : remaining_ids)
{
if (id_found_pair.second)
continue;
const auto id = id_found_pair.first;
const auto cell_idx = getCellIdx(id);
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
for (auto & attribute : attributes)
setDefaultAttributeValue(attribute, cell_idx);
/// Check if cell had not been occupied before and increment element counter if it hadn't
if (cell.id == 0 && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
cell.id = id;
/// same expiration rule as for the ids that were found
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// the default flag is set after the expiration time has been written
cell.setDefault();
/// inform caller that the cell has not been found
on_id_not_found(id, cell_idx);
}
}
/** Stores the attribute's configured null value into slot `idx` of the attribute's array.
  * For string attributes the slot is re-pointed at the null value string itself; memory the
  * slot held before is handed back to the string arena.
  */
void CacheDictionary::setDefaultAttributeValue(attribute_t & attribute, const id_t idx) const
{
    switch (attribute.type)
    {
#define SET_NULL_VALUE(TYPE) \
        case AttributeUnderlyingType::TYPE: \
            std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = std::get<TYPE>(attribute.null_values); \
            break;

        SET_NULL_VALUE(UInt8)
        SET_NULL_VALUE(UInt16)
        SET_NULL_VALUE(UInt32)
        SET_NULL_VALUE(UInt64)
        SET_NULL_VALUE(Int8)
        SET_NULL_VALUE(Int16)
        SET_NULL_VALUE(Int32)
        SET_NULL_VALUE(Int64)
        SET_NULL_VALUE(Float32)
        SET_NULL_VALUE(Float64)
#undef SET_NULL_VALUE

        case AttributeUnderlyingType::String:
        {
            auto & slot = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
            const auto & null_string = std::get<String>(attribute.null_values);

            /// the slot already refers to the null value: nothing to release, nothing to change
            if (slot.data == null_string.data())
                break;

            /// NOTE(review): setAttributeValue requests `size + 1` bytes from the arena while the
            /// slot is released here with `size` - confirm ArenaWithFreeLists::free accepts that
            if (slot.data)
                string_arena->free(const_cast<char *>(slot.data), slot.size);

            slot = StringRef{null_string};
            break;
        }
    }
}
/** Stores `value` into slot `idx` of the attribute's array.
  * Numeric values are read from the Field as UInt64 / Int64 / Float64 and assigned to the stored type.
  * A non-empty string is copied into the string arena together with its terminating zero byte;
  * an empty string leaves the slot as an empty StringRef.
  */
void CacheDictionary::setAttributeValue(attribute_t & attribute, const id_t idx, const Field & value) const
{
    switch (attribute.type)
    {
#define STORE_NUMBER(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            std::get<ContainerPtrType<TYPE>>(attribute.arrays)[idx] = value.get<FIELD_TYPE>(); \
            break;

        STORE_NUMBER(UInt8, UInt64)
        STORE_NUMBER(UInt16, UInt64)
        STORE_NUMBER(UInt32, UInt64)
        STORE_NUMBER(UInt64, UInt64)
        STORE_NUMBER(Int8, Int64)
        STORE_NUMBER(Int16, Int64)
        STORE_NUMBER(Int32, Int64)
        STORE_NUMBER(Int64, Int64)
        STORE_NUMBER(Float32, Float64)
        STORE_NUMBER(Float64, Float64)
#undef STORE_NUMBER

        case AttributeUnderlyingType::String:
        {
            const auto & new_string = value.get<String>();
            auto & slot = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
            const auto & null_string = std::get<String>(attribute.null_values);

            /// memory is released only if the slot holds something other than the null value string
            /// NOTE(review): the buffer was requested as `size + 1` bytes below but is released
            /// with `size` - confirm ArenaWithFreeLists::free accepts that
            const auto holds_arena_memory = slot.data && slot.data != null_string.data();
            if (holds_arena_memory)
                string_arena->free(const_cast<char *>(slot.data), slot.size);

            const auto length = new_string.size();
            if (length == 0)
            {
                slot = {};
                break;
            }

            /// one extra byte so that the terminating zero is copied as well
            auto buffer = string_arena->alloc(length + 1);
            std::copy(new_string.data(), new_string.data() + length + 1, buffer);
            slot = StringRef{buffer, length};
            break;
        }
    }
}
/// Returns the attribute registered under `attribute_name`; throws BAD_ARGUMENTS if there is none.
CacheDictionary::attribute_t & CacheDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);
    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS
    };
}
}

View File

@ -0,0 +1,829 @@
#include <DB/Dictionaries/ComplexKeyCacheDictionary.h>
namespace DB
{
/// Error codes used in this file. They are only declared here; the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int TYPE_MISMATCH;
    extern const int BAD_ARGUMENTS;
    extern const int UNSUPPORTED_METHOD;
    /// thrown by getItemsNumber for an unexpected attribute type
    extern const int LOGICAL_ERROR;
}
/** Rounds `n` up to the nearest power of two; a power of two is returned unchanged.
  * Zero is rounded up to 1, so the result can always be used as `hash & (result - 1)`.
  * Values above the largest power of two representable in std::size_t wrap around to 0.
  */
static inline std::size_t round_up_to_power_of_two(std::size_t n)
{
    /// 0 would wrap around on the decrement below and come back as 0, which is not a power of two
    if (n == 0)
        return 1;

    --n;

    /// smear the highest set bit into every lower position;
    /// the loop bound keeps every shift narrower than the type, whatever the width of std::size_t is
    for (std::size_t shift = 1; shift < sizeof(n) * 8; shift <<= 1)
        n |= n >> shift;

    ++n;
    return n;
}
/// Produces a seed by combining the nanosecond part of the monotonic clock with the process id.
static inline std::uint64_t getSeed()
{
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const auto nanoseconds = now.tv_nsec;
    return nanoseconds ^ getpid();
}
inline std::uint64_t ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const
{
const auto hash = StringRefHash{}(key);
const auto idx = hash & (size - 1);
return idx;
}
/** Creates an empty cache dictionary.
  * The requested `size` is rounded up to a power of two and the random engine is seeded from getSeed().
  * Throws UNSUPPORTED_METHOD if the source does not support selective load.
  */
ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const std::string & name, const DictionaryStructure & dict_struct,
    DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime,
    const std::size_t size)
    : name{name},
      dict_struct(dict_struct),
      source_ptr{std::move(source_ptr)},
      dict_lifetime(dict_lifetime),
      size{round_up_to_power_of_two(size)},
      rnd_engine{getSeed()}
{
    /// the parameter `source_ptr` has been moved from, hence the explicit `this->`
    const auto selective_load_supported = this->source_ptr->supportsSelectiveLoad();
    if (!selective_load_supported)
        throw Exception{
            name + ": source cannot be used with ComplexKeyCacheDictionary",
            ErrorCodes::UNSUPPORTED_METHOD};

    createAttributes();
}
/// Copy construction delegates to the main constructor with the same settings and a clone of the
/// source; nothing else is taken over from `other` by this constructor.
ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(const ComplexKeyCacheDictionary & other)
    : ComplexKeyCacheDictionary{
        other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.size}
{
}
#define DECLARE(TYPE)\
void ComplexKeyCacheDictionary::get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
PaddedPODArray<TYPE> & out) const\
{\
dict_struct.validateKeyTypes(key_types);\
\
auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItemsNumber<TYPE>(attribute, key_columns, out, [&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void ComplexKeyCacheDictionary::getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
ColumnString * out) const
{
dict_struct.validateKeyTypes(key_types);
auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
const auto null_value = StringRef{std::get<String>(attribute.null_values)};
getItemsString(attribute, key_columns, out, [&] (const std::size_t) { return null_value; });
}
#define DECLARE(TYPE)\
void ComplexKeyCacheDictionary::get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const\
{\
dict_struct.validateKeyTypes(key_types);\
\
auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, key_columns, out, [&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void ComplexKeyCacheDictionary::getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const ColumnString * const def, ColumnString * const out) const
{
dict_struct.validateKeyTypes(key_types);
auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsString(attribute, key_columns, out, [&] (const std::size_t row) { return def->getDataAt(row); });
}
#define DECLARE(TYPE)\
void ComplexKeyCacheDictionary::get##TYPE(\
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
const TYPE def, PaddedPODArray<TYPE> & out) const\
{\
dict_struct.validateKeyTypes(key_types);\
\
auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, key_columns, out, [&] (const std::size_t) { return def; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void ComplexKeyCacheDictionary::getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const String & def, ColumnString * const out) const
{
dict_struct.validateKeyTypes(key_types);
auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsString(attribute, key_columns, out, [&] (const std::size_t) { return StringRef{def}; });
}
void ComplexKeyCacheDictionary::has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
{
dict_struct.validateKeyTypes(key_types);
/// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
MapType<std::vector<std::size_t>> outdated_keys;
const auto rows = key_columns.front()->size();
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
PODArray<StringRef> keys_array(rows);
{
const Poco::ScopedReadRWLock read_lock{rw_lock};
const auto now = std::chrono::system_clock::now();
/// fetch up-to-date values, decide which ones require update
for (const auto row : ext::range(0, rows))
{
const auto key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
keys_array[row] = key;
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
const auto & cell = cells[cell_idx];
/** cell should be updated if either:
* 1. keys (or hash) do not match,
* 2. cell has expired,
* 3. explicit defaults were specified and cell was set default. */
if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
outdated_keys[key].push_back(row);
else
out[row] = !cell.isDefault();
}
}
query_count.fetch_add(rows, std::memory_order_relaxed);
hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);
if (outdated_keys.empty())
return;
std::vector<std::size_t> required_rows(outdated_keys.size());
std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows),
[] (auto & pair) { return pair.second.front(); });
/// request new values
update(key_columns, keys_array, required_rows, [&] (const auto key, const auto) {
for (const auto out_idx : outdated_keys[key])
out[out_idx] = true;
}, [&] (const auto key, const auto) {
for (const auto out_idx : outdated_keys[key])
out[out_idx] = false;
});
}
void ComplexKeyCacheDictionary::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
bytes_allocated += size * sizeof(cell_metadata_t);
bytes_allocated += size * sizeof(attributes.front());
for (const auto & attribute : dict_struct.attributes)
{
attribute_index_by_name.emplace(attribute.name, attributes.size());
attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value));
if (attribute.hierarchical)
throw Exception{
name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
ErrorCodes::TYPE_MISMATCH};
}
}
/** Builds the descriptor of one attribute: remembers its null value, allocates an array of
  * `size` values of the underlying type and adds the array's size to `bytes_allocated`.
  * Numeric null values are read from the Field as UInt64 / Int64 / Float64.
  */
ComplexKeyCacheDictionary::attribute_t ComplexKeyCacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t attr{type};

    switch (type)
    {
#define CREATE_NUMERIC(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            std::get<TYPE>(attr.null_values) = null_value.get<FIELD_TYPE>(); \
            std::get<ContainerPtrType<TYPE>>(attr.arrays) = std::make_unique<ContainerType<TYPE>>(size); \
            bytes_allocated += size * sizeof(TYPE); \
            break;

        CREATE_NUMERIC(UInt8, UInt64)
        CREATE_NUMERIC(UInt16, UInt64)
        CREATE_NUMERIC(UInt32, UInt64)
        CREATE_NUMERIC(UInt64, UInt64)
        CREATE_NUMERIC(Int8, Int64)
        CREATE_NUMERIC(Int16, Int64)
        CREATE_NUMERIC(Int32, Int64)
        CREATE_NUMERIC(Int64, Int64)
        CREATE_NUMERIC(Float32, Float64)
        CREATE_NUMERIC(Float64, Float64)
#undef CREATE_NUMERIC

        case AttributeUnderlyingType::String:
            std::get<String>(attr.null_values) = null_value.get<String>();
            std::get<ContainerPtrType<StringRef>>(attr.arrays) = std::make_unique<ContainerType<StringRef>>(size);
            bytes_allocated += size * sizeof(StringRef);

            /// the string arena is created by the first string attribute and then reused
            if (!string_arena)
                string_arena = std::make_unique<ArenaWithFreeLists>();
            break;
    }

    return attr;
}
/** Dispatches a numeric lookup on the type the attribute is stored as.
  * The matching getItemsNumberImpl<StoredType, OutputType> does the actual work;
  * an attribute whose type is not one of the ten numeric types is a logical error.
  */
template <typename OutputType, typename DefaultGetter>
void ComplexKeyCacheDictionary::getItemsNumber(
    attribute_t & attribute,
    const ConstColumnPlainPtrs & key_columns,
    PaddedPODArray<OutputType> & out,
    DefaultGetter && get_default) const
{
    switch (attribute.type)
    {
#define DISPATCH(TYPE) \
        case AttributeUnderlyingType::TYPE: \
            getItemsNumberImpl<TYPE, OutputType>(attribute, key_columns, out, std::forward<DefaultGetter>(get_default)); \
            break;

        DISPATCH(UInt8)
        DISPATCH(UInt16)
        DISPATCH(UInt32)
        DISPATCH(UInt64)
        DISPATCH(Int8)
        DISPATCH(Int16)
        DISPATCH(Int32)
        DISPATCH(Int64)
        DISPATCH(Float32)
        DISPATCH(Float64)
#undef DISPATCH

        default:
            throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
    }
}
/** Writes the value of `attribute` for the key of every row of `key_columns` into `out[row]`.
  *
  * Rows whose cell holds the same key (and hash) and has not expired are answered from the cache
  * under a read lock. The remaining keys are grouped and handed to update(), whose callbacks
  * fill in the rows that were left over.
  *
  * `get_default(row)` supplies the value for rows whose key is cached as default, and for rows
  * whose key update() reports as not found.
  */
template <typename AttributeType, typename OutputType, typename DefaultGetter>
void ComplexKeyCacheDictionary::getItemsNumberImpl(
    attribute_t & attribute,
    const ConstColumnPlainPtrs & key_columns,
    PaddedPODArray<OutputType> & out,
    DefaultGetter && get_default) const
{
    /// key -> every row index that holds this key and could not be served from the cache
    MapType<std::vector<std::size_t>> rows_by_stale_key;

    auto & values = std::get<ContainerPtrType<AttributeType>>(attribute.arrays);
    const auto rows = key_columns.front()->size();

    /// scratch buffers for building one key per row; placeKeysInPool is given `temporary_keys_pool` to work with
    StringRefs key_parts(dict_struct.key.value().size());
    Arena temporary_keys_pool;
    PODArray<StringRef> keys_array(rows);

    {
        const Poco::ScopedReadRWLock read_lock{rw_lock};
        const auto now = std::chrono::system_clock::now();

        for (const auto row : ext::range(0, rows))
        {
            const auto key = placeKeysInPool(row, key_columns, key_parts, temporary_keys_pool);
            keys_array[row] = key;

            const auto hash = StringRefHash{}(key);
            const auto cell_idx = hash & (size - 1);
            const auto & cell = cells[cell_idx];

            /// a cell can answer only if it holds exactly this key and has not expired yet
            const auto usable = cell.hash == hash && cell.key == key && !(cell.expiresAt() < now);
            if (usable)
                out[row] = cell.isDefault() ? get_default(row) : values[cell_idx];
            else
                rows_by_stale_key[key].push_back(row);
        }
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
    hit_count.fetch_add(rows - rows_by_stale_key.size(), std::memory_order_release);

    if (rows_by_stale_key.empty())
        return;

    /// the first row of every stale key represents that key in the request
    std::vector<std::size_t> required_rows;
    required_rows.reserve(rows_by_stale_key.size());
    for (const auto & entry : rows_by_stale_key)
        required_rows.push_back(entry.second.front());

    update(key_columns, keys_array, required_rows,
        [&] (const auto key, const auto cell_idx)
        {
            for (const auto row : rows_by_stale_key[key])
                out[row] = values[cell_idx];
        },
        [&] (const auto key, const auto)
        {
            for (const auto row : rows_by_stale_key[key])
                out[row] = get_default(row);
        });
}
/** Appends to `out` the string value of `attribute` for the key of every row of `key_columns`, in row order.
  *
  * An optimistic pass first appends strings straight from the cache under a read lock; it is
  * abandoned at the first row whose cell holds a different key or has expired. In that case
  * whatever was appended is dropped and a pessimistic pass runs: usable cached strings are
  * copied aside, the remaining keys are requested through update(), and `out` is then filled
  * from the copies.
  *
  * `get_default(row)` provides the string for rows whose key is cached as default and for
  * rows whose key update() reports as not found.
  */
template <typename DefaultGetter>
void ComplexKeyCacheDictionary::getItemsString(
    attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, ColumnString * out,
    DefaultGetter && get_default) const
{
    const auto rows = key_columns.front()->size();
    /// save on some allocations
    out->getOffsets().reserve(rows);

    const auto keys_size = dict_struct.key.value().size();
    StringRefs keys(keys_size);
    Arena temporary_keys_pool;

    auto & attribute_array = std::get<ContainerPtrType<StringRef>>(attribute.arrays);

    auto found_outdated_values = false;

    /// perform optimistic version, fallback to pessimistic if failed
    {
        const Poco::ScopedReadRWLock read_lock{rw_lock};

        const auto now = std::chrono::system_clock::now();
        /// fetch up-to-date values, discard on fail
        for (const auto row : ext::range(0, rows))
        {
            const auto key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
            /// the key is needed for this iteration only, so its pool memory is rolled back right away
            SCOPE_EXIT(temporary_keys_pool.rollback(key.size));

            const auto hash = StringRefHash{}(key);
            const auto cell_idx = hash & (size - 1);
            const auto & cell = cells[cell_idx];

            if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
            {
                found_outdated_values = true;
                break;
            }
            else
            {
                const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];
                out->insertData(string_ref.data, string_ref.size);
            }
        }
    }

    /// optimistic code completed successfully
    if (!found_outdated_values)
    {
        query_count.fetch_add(rows, std::memory_order_relaxed);
        hit_count.fetch_add(rows, std::memory_order_release);
        return;
    }

    /// now onto the pessimistic one, discard possible partial results from the optimistic path
    out->getChars().resize_assume_reserved(0);
    out->getOffsets().resize_assume_reserved(0);

    /// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
    MapType<std::vector<std::size_t>> outdated_keys;
    /// we are going to store every string separately
    MapType<String> map;
    PODArray<StringRef> keys_array(rows);

    /// running total of bytes (each string plus one extra byte) used to reserve `out` once before the final fill
    std::size_t total_length = 0;
    {
        const Poco::ScopedReadRWLock read_lock{rw_lock};

        const auto now = std::chrono::system_clock::now();
        for (const auto row : ext::range(0, rows))
        {
            /// this time the keys stay in `temporary_keys_pool` until the function returns
            const auto key = placeKeysInPool(row, key_columns, keys, temporary_keys_pool);
            keys_array[row] = key;

            const auto hash = StringRefHash{}(key);
            const auto cell_idx = hash & (size - 1);
            const auto & cell = cells[cell_idx];

            if (cell.hash != hash || cell.key != key || cell.expiresAt() < now)
                outdated_keys[key].push_back(row);
            else
            {
                const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx];

                /// only real values are copied aside; rows cached as default are resolved through get_default in the final loop
                if (!cell.isDefault())
                    map[key] = String{string_ref};

                total_length += string_ref.size + 1;
            }
        }
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
    hit_count.fetch_add(rows - outdated_keys.size(), std::memory_order_release);

    /// request new values
    if (!outdated_keys.empty())
    {
        std::vector<std::size_t> required_rows(outdated_keys.size());
        std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows),
            [] (auto & pair) { return pair.second.front(); });

        update(key_columns, keys_array, required_rows, [&] (const auto key, const auto cell_idx) {
            const auto attribute_value = attribute_array[cell_idx];

            /** `key` refers to memory owned by the dictionary cell. update() frees that memory as soon as
              * another key takes over the cell, which may happen later in the same update() call.
              * It must therefore never become a key of `map` or `outdated_keys`: entries are addressed
              * through the equal key that lives in `temporary_keys_pool` instead. */
            const auto outdated_it = outdated_keys.find(key);
            if (outdated_it == std::end(outdated_keys))
            {
                /// a key nobody asked for: refresh the copy only if one exists already (under a stable key)
                const auto map_it = map.find(key);
                if (map_it != std::end(map))
                    map_it->second = String{attribute_value};
                return;
            }

            map[outdated_it->first] = String{attribute_value};
            total_length += (attribute_value.size + 1) * outdated_it->second.size();
        }, [&] (const auto key, const auto cell_idx) {
            for (const auto row : outdated_keys[key])
                total_length += get_default(row).size + 1;
        });
    }

    out->getChars().reserve(total_length);

    /// fill `out` in row order; keys without an entry in `map` fall back to the caller's default
    for (const auto row : ext::range(0, ext::size(keys_array)))
    {
        const auto key = keys_array[row];
        const auto it = map.find(key);
        const auto string_ref = it != std::end(map) ? StringRef{it->second} : get_default(row);
        out->insertData(string_ref.data, string_ref.size);
    }
}
/** Loads the rows listed in `in_requested_rows` from the source and writes them into the cache cells.
  *
  * in_key_columns / in_keys  - key columns of the request and the per-row serialized keys built from them;
  * in_requested_rows         - indices (into in_keys) of the rows whose keys have to be fetched;
  * on_cell_updated(key, idx) - invoked for every row returned by the source, after cell `idx` has been filled;
  * on_key_not_found(key, idx)- invoked for every requested key the source did not return, after cell `idx`
  *                             has been filled with null_values and marked as default.
  *
  * The whole operation, including reading from the source, runs under the write lock on rw_lock.
  */
template <typename PresentKeyHandler, typename AbsentKeyHandler>
void ComplexKeyCacheDictionary::update(
const ConstColumnPlainPtrs & in_key_columns, const PODArray<StringRef> & in_keys,
const std::vector<std::size_t> & in_requested_rows, PresentKeyHandler && on_cell_updated,
AbsentKeyHandler && on_key_not_found) const
{
/// requested key -> "was it returned by the source"; the stored StringRefs point into in_keys
MapType<bool> remaining_keys{in_requested_rows.size()};
for (const auto row : in_requested_rows)
remaining_keys.insert({ in_keys[row], false });
/// cell lifetime (in seconds) is drawn from [min_sec, max_sec]
std::uniform_int_distribution<std::uint64_t> distribution{
dict_lifetime.min_sec,
dict_lifetime.max_sec
};
const Poco::ScopedWriteRWLock write_lock{rw_lock};
auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows);
stream->readPrefix();
const auto keys_size = dict_struct.key.value().size();
/// scratch buffer for the per-column key parts, reused for every row
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
while (const auto block = stream->read())
{
/// cache column pointers
/// (the first keys_size columns of the block are read as key columns, the following ones as attributes)
const auto key_columns = ext::map<ConstColumnPlainPtrs>(ext::range(0, keys_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(attribute_idx).column.get();
});
const auto attribute_columns = ext::map<ConstColumnPlainPtrs>(ext::range(0, attributes_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(keys_size + attribute_idx).column.get();
});
const auto rows = block.rowsInFirstColumn();
for (const auto row : ext::range(0, rows))
{
/// serialize the key of this row into the keys pool and pick the cell by masking its hash
/// NOTE(review): the mask assumes `size` is a power of two - confirm against the constructor
auto key = allocKey(row, key_columns, keys);
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
auto & cell = cells[cell_idx];
/// overwrite every attribute of the cell with the values from this row
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *attribute_columns[attribute_idx];
auto & attribute = attributes[attribute_idx];
setAttributeValue(attribute, cell_idx, attribute_column[row]);
}
/// if cell id is zero and zero does not map to this cell, then the cell is unused
if (cell.key == StringRef{} && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
/// handle memory allocated for old key
if (key == cell.key)
{
/// the cell already holds an equal key: drop the fresh copy and keep using the stored one
freeKey(key);
key = cell.key;
}
else
{
/// new key is different from the old one
if (cell.key.data)
freeKey(cell.key);
cell.key = key;
}
cell.hash = hash;
/// a zero bound of the lifetime means the cell never expires
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// inform caller
on_cell_updated(key, cell_idx);
/// mark corresponding id as found
remaining_keys[key] = true;
}
}
stream->readSuffix();
/// Check which ids have not been found and require setting null_value
for (const auto key_found_pair : remaining_keys)
{
if (key_found_pair.second)
continue;
auto key = key_found_pair.first;
const auto hash = StringRefHash{}(key);
const auto cell_idx = hash & (size - 1);
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
for (auto & attribute : attributes)
setDefaultAttributeValue(attribute, cell_idx);
/// Check if cell had not been occupied before and increment element counter if it hadn't
if (cell.key == StringRef{} && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
/// reuse the key stored in the cell when it is equal, otherwise replace it with an owned copy
if (key == cell.key)
key = cell.key;
else
{
if (cell.key.data)
freeKey(cell.key);
/// copy key from temporary pool
key = copyKey(key);
cell.key = key;
}
cell.hash = hash;
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
cell.setDefault();
/// inform caller that the cell has not been found
on_key_not_found(key, cell_idx);
}
}
/** Resets the value of `attribute` stored in cell `idx` to the attribute's null_value.
  *
  * Numeric attributes are simply overwritten. For String attributes the cell is made to reference
  * the null_value string itself (no copy); a previously stored string that lives in string_arena is released.
  */
void ComplexKeyCacheDictionary::setDefaultAttributeValue(attribute_t & attribute, const std::size_t idx) const
{
    switch (attribute.type)
    {
        case AttributeUnderlyingType::UInt8: std::get<ContainerPtrType<UInt8>>(attribute.arrays)[idx] = std::get<UInt8>(attribute.null_values); break;
        case AttributeUnderlyingType::UInt16: std::get<ContainerPtrType<UInt16>>(attribute.arrays)[idx] = std::get<UInt16>(attribute.null_values); break;
        case AttributeUnderlyingType::UInt32: std::get<ContainerPtrType<UInt32>>(attribute.arrays)[idx] = std::get<UInt32>(attribute.null_values); break;
        case AttributeUnderlyingType::UInt64: std::get<ContainerPtrType<UInt64>>(attribute.arrays)[idx] = std::get<UInt64>(attribute.null_values); break;
        case AttributeUnderlyingType::Int8: std::get<ContainerPtrType<Int8>>(attribute.arrays)[idx] = std::get<Int8>(attribute.null_values); break;
        case AttributeUnderlyingType::Int16: std::get<ContainerPtrType<Int16>>(attribute.arrays)[idx] = std::get<Int16>(attribute.null_values); break;
        case AttributeUnderlyingType::Int32: std::get<ContainerPtrType<Int32>>(attribute.arrays)[idx] = std::get<Int32>(attribute.null_values); break;
        case AttributeUnderlyingType::Int64: std::get<ContainerPtrType<Int64>>(attribute.arrays)[idx] = std::get<Int64>(attribute.null_values); break;
        case AttributeUnderlyingType::Float32: std::get<ContainerPtrType<Float32>>(attribute.arrays)[idx] = std::get<Float32>(attribute.null_values); break;
        case AttributeUnderlyingType::Float64: std::get<ContainerPtrType<Float64>>(attribute.arrays)[idx] = std::get<Float64>(attribute.null_values); break;
        case AttributeUnderlyingType::String:
        {
            const auto & null_value_ref = std::get<String>(attribute.null_values);
            auto & string_ref = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];

            /// nothing to do if the cell already references the null_value
            if (string_ref.data != null_value_ref.data())
            {
                /// Strings are placed into string_arena by setAttributeValue with size + 1 bytes
                /// (the value plus its terminating zero), so exactly that size has to be given back.
                if (string_ref.data)
                    string_arena->free(const_cast<char *>(string_ref.data), string_ref.size + 1);

                string_ref = StringRef{null_value_ref};
            }

            break;
        }
    }
}
/** Stores `value` as the value of `attribute` in cell `idx`.
  *
  * Numeric attributes are read from the Field through its nearest wide type (UInt64 / Int64 / Float64)
  * and assigned to the cell. String attributes are copied (with the terminating zero) into string_arena;
  * an empty string is stored as an empty StringRef without allocation. The string previously held by
  * the cell is released unless it is the shared null_value.
  */
void ComplexKeyCacheDictionary::setAttributeValue(attribute_t & attribute, const std::size_t idx, const Field & value) const
{
    switch (attribute.type)
    {
        case AttributeUnderlyingType::UInt8: std::get<ContainerPtrType<UInt8>>(attribute.arrays)[idx] = value.get<UInt64>(); break;
        case AttributeUnderlyingType::UInt16: std::get<ContainerPtrType<UInt16>>(attribute.arrays)[idx] = value.get<UInt64>(); break;
        case AttributeUnderlyingType::UInt32: std::get<ContainerPtrType<UInt32>>(attribute.arrays)[idx] = value.get<UInt64>(); break;
        case AttributeUnderlyingType::UInt64: std::get<ContainerPtrType<UInt64>>(attribute.arrays)[idx] = value.get<UInt64>(); break;
        case AttributeUnderlyingType::Int8: std::get<ContainerPtrType<Int8>>(attribute.arrays)[idx] = value.get<Int64>(); break;
        case AttributeUnderlyingType::Int16: std::get<ContainerPtrType<Int16>>(attribute.arrays)[idx] = value.get<Int64>(); break;
        case AttributeUnderlyingType::Int32: std::get<ContainerPtrType<Int32>>(attribute.arrays)[idx] = value.get<Int64>(); break;
        case AttributeUnderlyingType::Int64: std::get<ContainerPtrType<Int64>>(attribute.arrays)[idx] = value.get<Int64>(); break;
        case AttributeUnderlyingType::Float32: std::get<ContainerPtrType<Float32>>(attribute.arrays)[idx] = value.get<Float64>(); break;
        case AttributeUnderlyingType::Float64: std::get<ContainerPtrType<Float64>>(attribute.arrays)[idx] = value.get<Float64>(); break;
        case AttributeUnderlyingType::String:
        {
            const auto & string = value.get<String>();
            auto & string_ref = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
            const auto & null_value_ref = std::get<String>(attribute.null_values);

            /// Free memory unless it points to a null_value.
            /// The block was allocated below with size + 1 bytes, so the same size is passed to free.
            if (string_ref.data && string_ref.data != null_value_ref.data())
                string_arena->free(const_cast<char *>(string_ref.data), string_ref.size + 1);

            const auto size = string.size();
            if (size != 0)
            {
                /// copy the value together with its terminating zero
                auto string_ptr = string_arena->alloc(size + 1);
                std::copy(string.data(), string.data() + size + 1, string_ptr);
                string_ref = StringRef{string_ptr, size};
            }
            else
                string_ref = {};

            break;
        }
    }
}
/// Returns the attribute registered under `attribute_name`; throws BAD_ARGUMENTS if there is none.
ComplexKeyCacheDictionary::attribute_t & ComplexKeyCacheDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS};
}
/// Serializes the key of `row` into the pool matching the key kind (fixed-size or variable-size)
/// and returns a reference to the stored bytes. `keys` is scratch space used for variable-size keys only.
StringRef ComplexKeyCacheDictionary::allocKey(const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys) const
{
    return key_size_is_fixed
        ? placeKeysInFixedSizePool(row, key_columns)
        : placeKeysInPool(row, key_columns, keys, *keys_pool);
}
/// Returns the memory of `key` to the pool it is taken from for the current key kind.
void ComplexKeyCacheDictionary::freeKey(const StringRef key) const
{
    const auto key_memory = const_cast<char *>(key.data);

    if (!key_size_is_fixed)
    {
        keys_pool->free(key_memory, key.size);
        return;
    }

    fixed_size_keys_pool->free(key_memory);
}
/** Builds a contiguous composite key for `row`.
  * The value of every key column (taken with its terminating zero) is recorded in `keys`,
  * then all parts are copied back to back into a single chunk allocated from `pool`.
  * Returns a reference to that chunk.
  */
template <typename Arena>
StringRef ComplexKeyCacheDictionary::placeKeysInPool(
    const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys, Arena & pool)
{
    const auto columns_count = key_columns.size();

    /// first pass: collect the parts and the total length
    size_t total_size = 0;
    for (size_t i = 0; i < columns_count; ++i)
    {
        keys[i] = key_columns[i]->getDataAtWithTerminatingZero(row);
        total_size += keys[i].size;
    }

    const auto begin = pool.alloc(total_size);

    /// second pass: lay the parts out one after another
    auto dst = begin;
    for (const auto i : ext::range(0, columns_count))
    {
        const auto & part = keys[i];
        memcpy(dst, part.data, part.size);
        dst += part.size;
    }

    return { begin, total_size };
}
/// Builds a composite key for `row` in a chunk taken from fixed_size_keys_pool:
/// the values of all key columns are copied back to back; the returned reference has length key_size.
StringRef ComplexKeyCacheDictionary::placeKeysInFixedSizePool(
    const std::size_t row, const ConstColumnPlainPtrs & key_columns) const
{
    const auto begin = fixed_size_keys_pool->alloc();

    auto dst = begin;
    for (const auto i : ext::range(0, key_columns.size()))
    {
        const auto part = key_columns[i]->getDataAt(row);
        memcpy(dst, part.data, part.size);
        dst += part.size;
    }

    return { begin, key_size };
}
/// Duplicates the bytes of `key` into memory taken from the pool used for the current key kind
/// and returns a reference to the copy.
StringRef ComplexKeyCacheDictionary::copyKey(const StringRef key) const
{
    const auto length = key.size;
    const auto dst = key_size_is_fixed ? fixed_size_keys_pool->alloc() : keys_pool->alloc(length);

    memcpy(dst, key.data, length);

    return { dst, length };
}
}

View File

@ -0,0 +1,505 @@
#include <ext/map.hpp>
#include <ext/range.hpp>
#include <DB/Dictionaries/ComplexKeyHashedDictionary.h>
namespace DB
{
/// Error codes used in this translation unit (defined elsewhere).
namespace ErrorCodes
{
    extern const int TYPE_MISMATCH;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int BAD_ARGUMENTS;
    extern const int DICTIONARY_IS_EMPTY;
    /// thrown by getItemsNumber on an unexpected attribute type; declared here so the file
    /// does not depend on some header happening to declare it
    extern const int LOGICAL_ERROR;
}
/** Creates the dictionary and immediately loads its whole content from the source.
  * A failure while loading or while measuring memory does not escape the constructor:
  * the exception is remembered in creation_exception instead.
  */
ComplexKeyHashedDictionary::ComplexKeyHashedDictionary(
    const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr,
    const DictionaryLifetime dict_lifetime, bool require_nonempty)
    : name{name},
      dict_struct(dict_struct),
      source_ptr{std::move(source_ptr)},
      dict_lifetime(dict_lifetime),
      require_nonempty(require_nonempty)
{
    /// attribute creation is outside the try block, so its exceptions propagate to the caller
    createAttributes();

    try
    {
        loadData();
        calculateBytesAllocated();
    }
    catch (...)
    {
        creation_exception = std::current_exception();
    }

    creation_time = std::chrono::system_clock::now();
}
/// "Copying" builds a new dictionary with the same settings and a clone of the source;
/// the data is loaded again by the delegated constructor rather than copied.
ComplexKeyHashedDictionary::ComplexKeyHashedDictionary(const ComplexKeyHashedDictionary & other)
    : ComplexKeyHashedDictionary{
        other.name,
        other.dict_struct,
        other.source_ptr->clone(),
        other.dict_lifetime,
        other.require_nonempty}
{
}
/// Defines get<TYPE>(attribute_name, key_columns, key_types, out) for every numeric type:
/// validates the key types and the attribute type, then fills `out` row by row,
/// using the attribute's null_value for keys that are absent from the dictionary.
#define DECLARE(TYPE)\
void ComplexKeyHashedDictionary::get##TYPE(\
    const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
    PaddedPODArray<TYPE> & out) const\
{\
    dict_struct.validateKeyTypes(key_types);\
    \
    const auto & attribute = getAttribute(attribute_name);\
    const auto convertible = isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE);\
    if (!convertible)\
        throw Exception{\
            name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
            ErrorCodes::TYPE_MISMATCH};\
    \
    const auto fallback = std::get<TYPE>(attribute.null_values);\
    const auto store = [&] (const std::size_t row, const auto value) { out[row] = value; };\
    const auto value_if_absent = [&] (const std::size_t) { return fallback; };\
    \
    getItemsNumber<TYPE>(attribute, key_columns, store, value_if_absent);\
}

DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)

#undef DECLARE
void ComplexKeyHashedDictionary::getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
ColumnString * out) const
{
dict_struct.validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
const auto & null_value = StringRef{std::get<String>(attribute.null_values)};
getItemsImpl<StringRef, StringRef>(attribute, key_columns,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return null_value; });
}
/// Defines get<TYPE>(attribute_name, key_columns, key_types, def, out) for every numeric type:
/// same as the plain getter, but a key absent from the dictionary yields def[row] for its row.
#define DECLARE(TYPE)\
void ComplexKeyHashedDictionary::get##TYPE(\
    const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
    const PaddedPODArray<TYPE> & def, PaddedPODArray<TYPE> & out) const\
{\
    dict_struct.validateKeyTypes(key_types);\
    \
    const auto & attribute = getAttribute(attribute_name);\
    const auto convertible = isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE);\
    if (!convertible)\
        throw Exception{\
            name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
            ErrorCodes::TYPE_MISMATCH};\
    \
    const auto store = [&] (const std::size_t row, const auto value) { out[row] = value; };\
    const auto value_if_absent = [&] (const std::size_t row) { return def[row]; };\
    \
    getItemsNumber<TYPE>(attribute, key_columns, store, value_if_absent);\
}

DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)

#undef DECLARE
void ComplexKeyHashedDictionary::getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const ColumnString * const def, ColumnString * const out) const
{
dict_struct.validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsImpl<StringRef, StringRef>(attribute, key_columns,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t row) { return def->getDataAt(row); });
}
/// Defines get<TYPE>(attribute_name, key_columns, key_types, def, out) for every numeric type:
/// same as the plain getter, but every key absent from the dictionary yields the single value `def`.
#define DECLARE(TYPE)\
void ComplexKeyHashedDictionary::get##TYPE(\
    const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,\
    const TYPE def, PaddedPODArray<TYPE> & out) const\
{\
    dict_struct.validateKeyTypes(key_types);\
    \
    const auto & attribute = getAttribute(attribute_name);\
    const auto convertible = isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE);\
    if (!convertible)\
        throw Exception{\
            name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
            ErrorCodes::TYPE_MISMATCH};\
    \
    const auto store = [&] (const std::size_t row, const auto value) { out[row] = value; };\
    const auto value_if_absent = [&] (const std::size_t) { return def; };\
    \
    getItemsNumber<TYPE>(attribute, key_columns, store, value_if_absent);\
}

DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)

#undef DECLARE
void ComplexKeyHashedDictionary::getString(
const std::string & attribute_name, const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types,
const String & def, ColumnString * const out) const
{
dict_struct.validateKeyTypes(key_types);
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsImpl<StringRef, StringRef>(attribute, key_columns,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return StringRef{def}; });
}
void ComplexKeyHashedDictionary::has(const ConstColumnPlainPtrs & key_columns, const DataTypes & key_types, PaddedPODArray<UInt8> & out) const
{
dict_struct.validateKeyTypes(key_types);
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, key_columns, out); break;
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, key_columns, out); break;
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, key_columns, out); break;
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int8: has<Int8>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int16: has<Int16>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int32: has<Int32>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Int64: has<Int64>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Float32: has<Float32>(attribute, key_columns, out); break;
case AttributeUnderlyingType::Float64: has<Float64>(attribute, key_columns, out); break;
case AttributeUnderlyingType::String: has<StringRef>(attribute, key_columns, out); break;
}
}
/// Creates the storage for every attribute described in dict_struct and registers its index by name.
/// Hierarchical attributes are rejected with TYPE_MISMATCH.
void ComplexKeyHashedDictionary::createAttributes()
{
    attributes.reserve(dict_struct.attributes.size());

    for (const auto & description : dict_struct.attributes)
    {
        /// the attribute is about to be appended, so its index is the current size
        const auto index = attributes.size();
        attribute_index_by_name.emplace(description.name, index);
        attributes.push_back(createAttributeWithType(description.underlying_type, description.null_value));

        if (!description.hierarchical)
            continue;

        throw Exception{
            name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
            ErrorCodes::TYPE_MISMATCH};
    }
}
void ComplexKeyHashedDictionary::loadData()
{
auto stream = source_ptr->loadAll();
stream->readPrefix();
/// created upfront to avoid excess allocations
const auto keys_size = dict_struct.key.value().size();
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
while (const auto block = stream->read())
{
const auto rows = block.rowsInFirstColumn();
element_count += rows;
const auto key_column_ptrs = ext::map<ConstColumnPlainPtrs>(ext::range(0, keys_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(attribute_idx).column.get();
});
const auto attribute_column_ptrs = ext::map<ConstColumnPlainPtrs>(ext::range(0, attributes_size),
[&] (const std::size_t attribute_idx) {
return block.getByPosition(keys_size + attribute_idx).column.get();
});
for (const auto row_idx : ext::range(0, rows))
{
/// calculate key once per row
const auto key = placeKeysInPool(row_idx, key_column_ptrs, keys, keys_pool);
auto should_rollback = false;
for (const auto attribute_idx : ext::range(0, attributes_size))
{
const auto & attribute_column = *attribute_column_ptrs[attribute_idx];
auto & attribute = attributes[attribute_idx];
const auto inserted = setAttributeValue(attribute, key, attribute_column[row_idx]);
if (!inserted)
should_rollback = true;
}
/// @note on multiple equal keys the mapped value for the first one is stored
if (should_rollback)
keys_pool.rollback(key.size);
}
}
stream->readSuffix();
if (require_nonempty && 0 == element_count)
throw Exception{
name + ": dictionary source is empty and 'require_nonempty' property is set.",
ErrorCodes::DICTIONARY_IS_EMPTY};
}
template <typename T>
void ComplexKeyHashedDictionary::addAttributeSize(const attribute_t & attribute)
{
const auto & map_ref = std::get<ContainerPtrType<T>>(attribute.maps);
bytes_allocated += sizeof(ContainerType<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
/// Accumulates into bytes_allocated the size of the attribute descriptors, of every attribute container
/// (plus the string arena for String attributes) and of the keys pool.
void ComplexKeyHashedDictionary::calculateBytesAllocated()
{
    bytes_allocated += attributes.size() * sizeof(attributes.front());

    for (const auto & attribute : attributes)
    {
        switch (attribute.type)
        {
#define DISPATCH(TYPE) \
            case AttributeUnderlyingType::TYPE: addAttributeSize<TYPE>(attribute); break;
            DISPATCH(UInt8)
            DISPATCH(UInt16)
            DISPATCH(UInt32)
            DISPATCH(UInt64)
            DISPATCH(Int8)
            DISPATCH(Int16)
            DISPATCH(Int32)
            DISPATCH(Int64)
            DISPATCH(Float32)
            DISPATCH(Float64)
#undef DISPATCH
            case AttributeUnderlyingType::String:
                /// string values live in a per-attribute arena that is counted as well
                addAttributeSize<StringRef>(attribute);
                bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
                break;
        }
    }

    bytes_allocated += keys_pool.size();
}
/// Initializes a numeric attribute of type T: stores its null_value (read from the Field through
/// the nearest field type of T) and creates an empty container for its values.
template <typename T>
void ComplexKeyHashedDictionary::createAttributeImpl(attribute_t & attribute, const Field & null_value)
{
    using FieldType = typename NearestFieldType<T>::Type;

    std::get<T>(attribute.null_values) = null_value.get<FieldType>();
    std::get<ContainerPtrType<T>>(attribute.maps) = std::make_unique<ContainerType<T>>();
}
/// Builds an attribute of the given underlying type with its null_value and an empty value container.
/// String attributes additionally get their own arena for the string data.
ComplexKeyHashedDictionary::attribute_t ComplexKeyHashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t attr{type};

    switch (type)
    {
#define DISPATCH(TYPE) \
        case AttributeUnderlyingType::TYPE: createAttributeImpl<TYPE>(attr, null_value); break;
        DISPATCH(UInt8)
        DISPATCH(UInt16)
        DISPATCH(UInt32)
        DISPATCH(UInt64)
        DISPATCH(Int8)
        DISPATCH(Int16)
        DISPATCH(Int32)
        DISPATCH(Int64)
        DISPATCH(Float32)
        DISPATCH(Float64)
#undef DISPATCH
        case AttributeUnderlyingType::String:
            /// the null_value is kept as String while the container holds StringRef
            std::get<String>(attr.null_values) = null_value.get<String>();
            std::get<ContainerPtrType<StringRef>>(attr.maps) = std::make_unique<ContainerType<StringRef>>();
            attr.string_arena = std::make_unique<Arena>();
            break;
    }

    return attr;
}
template <typename OutputType, typename ValueSetter, typename DefaultGetter>
void ComplexKeyHashedDictionary::getItemsNumber(
const attribute_t & attribute,
const ConstColumnPlainPtrs & key_columns,
ValueSetter && set_value,
DefaultGetter && get_default) const
{
if (false) {}
#define DISPATCH(TYPE) \
else if (attribute.type == AttributeUnderlyingType::TYPE) \
getItemsImpl<TYPE, OutputType>(attribute, key_columns, std::forward<ValueSetter>(set_value), std::forward<DefaultGetter>(get_default));
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
DISPATCH(UInt64)
DISPATCH(Int8)
DISPATCH(Int16)
DISPATCH(Int32)
DISPATCH(Int64)
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
else
throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
}
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void ComplexKeyHashedDictionary::getItemsImpl(
const attribute_t & attribute,
const ConstColumnPlainPtrs & key_columns,
ValueSetter && set_value,
DefaultGetter && get_default) const
{
const auto & attr = *std::get<ContainerPtrType<AttributeType>>(attribute.maps);
const auto keys_size = key_columns.size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
const auto rows = key_columns.front()->size();
for (const auto i : ext::range(0, rows))
{
/// copy key data to arena so it is contiguous and return StringRef to it
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
const auto it = attr.find(key);
set_value(i, it != attr.end() ? it->second : get_default(i));
/// free memory allocated for the key
temporary_keys_pool.rollback(key.size);
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Inserts (key, value) into the container of `attribute`.
/// Returns the flag reported by the container's insert (whether a new entry was inserted).
template <typename T>
bool ComplexKeyHashedDictionary::setAttributeValueImpl(attribute_t & attribute, const StringRef key, const T value)
{
    auto & container = *std::get<ContainerPtrType<T>>(attribute.maps);
    return container.insert({ key, value }).second;
}
/** Inserts `value` for `key` into the container of `attribute`, converting the Field to the stored type.
  * Numeric values are read through the nearest wide type (UInt64 / Int64 / Float64);
  * string values are first copied into the attribute's arena.
  * Returns whether a new entry was inserted.
  */
bool ComplexKeyHashedDictionary::setAttributeValue(attribute_t & attribute, const StringRef key, const Field & value)
{
    switch (attribute.type)
    {
#define DISPATCH(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            return setAttributeValueImpl<TYPE>(attribute, key, value.get<FIELD_TYPE>());
        DISPATCH(UInt8, UInt64)
        DISPATCH(UInt16, UInt64)
        DISPATCH(UInt32, UInt64)
        DISPATCH(UInt64, UInt64)
        DISPATCH(Int8, Int64)
        DISPATCH(Int16, Int64)
        DISPATCH(Int32, Int64)
        DISPATCH(Int64, Int64)
        DISPATCH(Float32, Float64)
        DISPATCH(Float64, Float64)
#undef DISPATCH
        case AttributeUnderlyingType::String:
        {
            auto & container = *std::get<ContainerPtrType<StringRef>>(attribute.maps);
            const auto & string = value.get<String>();

            /// the container stores only a reference, the bytes themselves go to the arena
            const auto stored = attribute.string_arena->insert(string.data(), string.size());
            return container.insert({ key, StringRef{stored, string.size()} }).second;
        }
    }

    return {};
}
/// Returns the attribute registered under `attribute_name`; throws BAD_ARGUMENTS if there is none.
const ComplexKeyHashedDictionary::attribute_t & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS};
}
/** Builds a contiguous composite key for `row`.
  * The value of every key column (taken with its terminating zero) is recorded in `keys`,
  * then all parts are copied back to back into a single chunk allocated from `pool`.
  * Returns a reference to that chunk.
  */
StringRef ComplexKeyHashedDictionary::placeKeysInPool(
    const std::size_t row, const ConstColumnPlainPtrs & key_columns, StringRefs & keys, Arena & pool)
{
    const auto columns_count = key_columns.size();

    /// first pass: collect the parts and the total length
    size_t total_size = 0;
    for (size_t i = 0; i < columns_count; ++i)
    {
        keys[i] = key_columns[i]->getDataAtWithTerminatingZero(row);
        total_size += keys[i].size;
    }

    const auto begin = pool.alloc(total_size);

    /// second pass: lay the parts out one after another
    auto dst = begin;
    for (const auto i : ext::range(0, columns_count))
    {
        const auto & part = keys[i];
        memcpy(dst, part.data, part.size);
        dst += part.size;
    }

    return { begin, total_size };
}
template <typename T>
void ComplexKeyHashedDictionary::has(const attribute_t & attribute, const ConstColumnPlainPtrs & key_columns, PaddedPODArray<UInt8> & out) const
{
const auto & attr = *std::get<ContainerPtrType<T>>(attribute.maps);
const auto keys_size = key_columns.size();
StringRefs keys(keys_size);
Arena temporary_keys_pool;
const auto rows = key_columns.front()->size();
for (const auto i : ext::range(0, rows))
{
/// copy key data to arena so it is contiguous and return StringRef to it
const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool);
const auto it = attr.find(key);
out[i] = it != attr.end();
/// free memory allocated for the key
temporary_keys_pool.rollback(key.size);
}
query_count.fetch_add(rows, std::memory_order_relaxed);
}
}

View File

@ -0,0 +1,305 @@
#include <DB/Dictionaries/DictionaryStructure.h>
namespace DB
{
/// Error codes used in this translation unit (defined elsewhere).
namespace ErrorCodes
{
    extern const int UNKNOWN_TYPE;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int TYPE_MISMATCH;
    /// thrown by the configuration-parsing constructors below; declared here so the file
    /// does not depend on some header happening to declare it
    extern const int BAD_ARGUMENTS;
}
/** Tells whether an attribute stored as `from` may be read as `to`.
  *
  * This list may be somewhat incomplete and its meaning may differ from NumberTraits.h
  * (for example, integers cannot be converted to floats here).
  * That is fine for the limited area where it is used.
  */
bool isAttributeTypeConvertibleTo(AttributeUnderlyingType from, AttributeUnderlyingType to)
{
    using Type = AttributeUnderlyingType;

    if (from == to)
        return true;

    switch (from)
    {
        case Type::UInt8:
            return to == Type::UInt16 || to == Type::UInt32 || to == Type::UInt64
                || to == Type::Int16 || to == Type::Int32 || to == Type::Int64;

        case Type::UInt16:
            return to == Type::UInt32 || to == Type::UInt64
                || to == Type::Int32 || to == Type::Int64;

        case Type::UInt32:
            return to == Type::UInt64 || to == Type::Int64;

        case Type::Int8:
            return to == Type::Int16 || to == Type::Int32 || to == Type::Int64;

        case Type::Int16:
            return to == Type::Int32 || to == Type::Int64;

        case Type::Int32:
            return to == Type::Int64;

        case Type::Float32:
            return to == Type::Float64;

        default:
            return false;
    }
}
/// Maps a type name from the dictionary configuration to the underlying storage type.
/// "Date" is stored as UInt16 and "DateTime" as UInt32. Throws UNKNOWN_TYPE for any other name.
AttributeUnderlyingType getAttributeUnderlyingType(const std::string & type)
{
    static const std::unordered_map<std::string, AttributeUnderlyingType> dictionary{
        { "UInt8", AttributeUnderlyingType::UInt8 },
        { "UInt16", AttributeUnderlyingType::UInt16 },
        { "UInt32", AttributeUnderlyingType::UInt32 },
        { "UInt64", AttributeUnderlyingType::UInt64 },
        { "Int8", AttributeUnderlyingType::Int8 },
        { "Int16", AttributeUnderlyingType::Int16 },
        { "Int32", AttributeUnderlyingType::Int32 },
        { "Int64", AttributeUnderlyingType::Int64 },
        { "Float32", AttributeUnderlyingType::Float32 },
        { "Float64", AttributeUnderlyingType::Float64 },
        { "String", AttributeUnderlyingType::String },
        { "Date", AttributeUnderlyingType::UInt16 },
        { "DateTime", AttributeUnderlyingType::UInt32 },
    };

    const auto found = dictionary.find(type);

    if (found == std::end(dictionary))
        throw Exception{
            "Unknown type " + type,
            ErrorCodes::UNKNOWN_TYPE};

    return found->second;
}
/// Returns the name of the underlying type.
/// Throws ARGUMENT_OUT_OF_BOUND if `type` holds a value that is not one of the enumerators.
std::string toString(const AttributeUnderlyingType type)
{
    const char * type_name = nullptr;

    switch (type)
    {
        case AttributeUnderlyingType::UInt8: type_name = "UInt8"; break;
        case AttributeUnderlyingType::UInt16: type_name = "UInt16"; break;
        case AttributeUnderlyingType::UInt32: type_name = "UInt32"; break;
        case AttributeUnderlyingType::UInt64: type_name = "UInt64"; break;
        case AttributeUnderlyingType::Int8: type_name = "Int8"; break;
        case AttributeUnderlyingType::Int16: type_name = "Int16"; break;
        case AttributeUnderlyingType::Int32: type_name = "Int32"; break;
        case AttributeUnderlyingType::Int64: type_name = "Int64"; break;
        case AttributeUnderlyingType::Float32: type_name = "Float32"; break;
        case AttributeUnderlyingType::Float64: type_name = "Float64"; break;
        case AttributeUnderlyingType::String: type_name = "String"; break;
    }

    if (type_name)
        return type_name;

    throw Exception{
        "Unknown attribute_type " + toString(static_cast<int>(type)),
        ErrorCodes::ARGUMENT_OUT_OF_BOUND};
}
/// Reads the lifetime from the configuration: either a range given by "<prefix>.min" and "<prefix>.max",
/// or, when "<prefix>.min" is absent, a single value at "<prefix>" used for both bounds.
DictionaryLifetime::DictionaryLifetime(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
{
    const auto min_key = config_prefix + ".min";

    if (config.has(min_key))
    {
        this->min_sec = config.getInt(min_key);
        this->max_sec = config.getInt(config_prefix + ".max");
    }
    else
    {
        this->min_sec = config.getInt(config_prefix);
        this->max_sec = this->min_sec;
    }
}
/// Reads "<prefix>.name" and "<prefix>.expression" (both default to an empty string).
/// An expression without a name is rejected with BAD_ARGUMENTS.
DictionarySpecialAttribute::DictionarySpecialAttribute(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
    : name{config.getString(config_prefix + ".name", "")},
      expression{config.getString(config_prefix + ".expression", "")}
{
    const auto expression_without_name = !expression.empty() && name.empty();

    if (expression_without_name)
        throw Exception{
            "Element " + config_prefix + ".name is empty",
            ErrorCodes::BAD_ARGUMENTS};
}
/** Parses the dictionary structure from the configuration.
  * Exactly one of "<prefix>.id" (simple key) and "<prefix>.key" (composite key) must be present.
  * With "id", the optional "range_min" / "range_max" elements are read as well and has_expressions
  * is set if any of these special attributes carries an expression.
  * Throws BAD_ARGUMENTS on a malformed key specification or when no attributes are defined.
  */
DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
{
    const auto id_path = config_prefix + ".id";
    const auto key_path = config_prefix + ".key";

    const auto has_id = config.has(id_path);
    const auto has_key = config.has(key_path);

    if (has_key && has_id)
        throw Exception{"Only one of 'id' and 'key' should be specified", ErrorCodes::BAD_ARGUMENTS};

    if (!has_id && !has_key)
        throw Exception{"Dictionary structure should specify either 'id' or 'key'", ErrorCodes::BAD_ARGUMENTS};

    if (has_id)
    {
        id.emplace(config, id_path);
    }
    else
    {
        key.emplace(getAttributes(config, key_path, false, false));

        if (key->empty())
            throw Exception{"Empty 'key' supplied", ErrorCodes::BAD_ARGUMENTS};
    }

    if (id)
    {
        if (id->name.empty())
            throw Exception{"'id' cannot be empty", ErrorCodes::BAD_ARGUMENTS};

        const auto range_min_path = config_prefix + ".range_min";
        if (config.has(range_min_path))
            range_min.emplace(config, range_min_path);

        const auto range_max_path = config_prefix + ".range_max";
        if (config.has(range_max_path))
            range_max.emplace(config, range_max_path);

        const auto id_has_expression = !id->expression.empty();
        const auto range_min_has_expression = range_min && !range_min->expression.empty();
        const auto range_max_has_expression = range_max && !range_max->expression.empty();

        if (id_has_expression || range_min_has_expression || range_max_has_expression)
            has_expressions = true;
    }

    attributes = getAttributes(config, config_prefix);

    if (attributes.empty())
        throw Exception{"Dictionary has no attributes defined", ErrorCodes::BAD_ARGUMENTS};
}
void DictionaryStructure::validateKeyTypes(const DataTypes & key_types) const
{
if (key_types.size() != key.value().size())
throw Exception{
"Key structure does not match, expected " + getKeyDescription(),
ErrorCodes::TYPE_MISMATCH};
for (const auto i : ext::range(0, key_types.size()))
{
const auto & expected_type = (*key)[i].type->getName();
const auto & actual_type = key_types[i]->getName();
if (expected_type != actual_type)
throw Exception{
"Key type at position " + std::to_string(i) + " does not match, expected " + expected_type +
", found " + actual_type,
ErrorCodes::TYPE_MISMATCH};
}
}
std::string DictionaryStructure::getKeyDescription() const
{
if (id)
return "UInt64";
std::ostringstream out;
out << '(';
auto first = true;
for (const auto & key_i : *key)
{
if (!first)
out << ", ";
first = false;
out << key_i.type->getName();
}
out << ')';
return out.str();
}
/// Reports whether the key contains no String-typed attribute.
/// Returns true when no composite key is configured; returns false as soon as
/// a key attribute with underlying type String is found.
bool DictionaryStructure::isKeySizeFixed() const
{
    if (!key)
        return true;

    /// Iterate by const reference: a by-value loop variable copies every
    /// DictionaryAttribute only to inspect one enum field.
    for (const auto & key_i : *key)
        if (key_i.underlying_type == AttributeUnderlyingType::String)
            return false;

    return true;
}
/// Sums `getSizeOfField()` over all key attributes.
/// Dereferences `key` unconditionally, so it must only be called when a composite key is configured.
std::size_t DictionaryStructure::getKeySize() const
{
    std::size_t total_size{};

    for (const auto & key_attribute : *key)
        total_size = total_size + key_attribute.type->getSizeOfField();

    return total_size;
}
/// Builds a DictionaryAttribute for every child of `config_prefix` whose key starts with "attribute".
///
/// hierarchy_allowed - when false, an attribute marked `hierarchical` is rejected.
/// allow_null_values - when true, `null_value` is read (no default) and parsed with the attribute's
///                     data type; when false it is not read and the attribute keeps a default Field.
///
/// Sets has_expressions when any attribute has a non-empty `expression`.
/// Throws BAD_ARGUMENTS for an empty name, an unparsable null_value, a hierarchical attribute
/// where hierarchy is not allowed, or more than one hierarchical attribute.
std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
    const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix,
    const bool hierarchy_allowed, const bool allow_null_values)
{
    /// Local names are chosen so they do not shadow the `key` and `attributes` members.
    Poco::Util::AbstractConfiguration::Keys config_keys;
    config.keys(config_prefix, config_keys);

    auto has_hierarchy = false;
    std::vector<DictionaryAttribute> parsed_attributes;

    for (const auto & config_key : config_keys)
    {
        /// Only children named "attribute..." describe attributes; skip everything else.
        if (0 != strncmp(config_key.data(), "attribute", strlen("attribute")))
            continue;

        const auto prefix = config_prefix + '.' + config_key + '.';

        const auto name = config.getString(prefix + "name");
        const auto type_string = config.getString(prefix + "type");
        const auto type = DataTypeFactory::instance().get(type_string);
        const auto underlying_type = getAttributeUnderlyingType(type_string);

        const auto expression = config.getString(prefix + "expression", "");
        if (!expression.empty())
            has_expressions = true;

        Field null_value;
        if (allow_null_values)
        {
            const auto null_value_string = config.getString(prefix + "null_value");
            try
            {
                /// Parse the textual null value through a one-row column of the attribute's type.
                ReadBufferFromString null_value_buffer{null_value_string};
                ColumnPtr column_with_null_value = type->createColumn();
                type->deserializeTextEscaped(*column_with_null_value, null_value_buffer);
                null_value = (*column_with_null_value)[0];
            }
            catch (const std::exception & e)
            {
                throw Exception{
                    std::string{"Error parsing null_value: "} + e.what(),
                    ErrorCodes::BAD_ARGUMENTS};
            }
        }

        const auto hierarchical = config.getBool(prefix + "hierarchical", false);
        const auto injective = config.getBool(prefix + "injective", false);

        if (name.empty())
            throw Exception{
                "Properties 'name' and 'type' of an attribute cannot be empty",
                ErrorCodes::BAD_ARGUMENTS};

        /// Test the attribute being parsed. The previous check looked at `has_hierarchy`, which is
        /// only set after an attribute has been accepted, so the offending attribute slipped through
        /// and the error was raised (if at all) for a later, unrelated attribute.
        if (hierarchical && !hierarchy_allowed)
            throw Exception{
                "Hierarchy not allowed in '" + prefix,
                ErrorCodes::BAD_ARGUMENTS};

        if (has_hierarchy && hierarchical)
            throw Exception{
                "Only one hierarchical attribute supported",
                ErrorCodes::BAD_ARGUMENTS};

        has_hierarchy = has_hierarchy || hierarchical;

        parsed_attributes.emplace_back(DictionaryAttribute{
            name, underlying_type, type, expression, null_value, hierarchical, injective
        });
    }

    return parsed_attributes;
}
}

View File

@ -0,0 +1,459 @@
#include <DB/Dictionaries/FlatDictionary.h>
namespace DB
{
/// Error codes referenced in this translation unit. They are only declared here (extern);
/// their definitions live elsewhere.
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
extern const int LOGICAL_ERROR;
extern const int UNKNOWN_TYPE;
}
/// Number of elements each attribute array is created with.
static const int initial_array_size = 1024;
/// Identifiers greater than or equal to this value are rejected when values are stored.
static const int max_array_size = 500000;
/// Creates the dictionary and immediately tries to load it from `source_ptr`.
/// Attribute storage is created first (createAttributes may throw to the caller).
/// Any exception raised while loading data or computing the allocated size is NOT propagated:
/// it is stored in `creation_exception`. `creation_time` is set in either case.
FlatDictionary::FlatDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, bool require_nonempty)
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
createAttributes();
try
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
// Keep the failure for later inspection instead of throwing out of the constructor.
creation_exception = std::current_exception();
}
creation_time = std::chrono::system_clock::now();
}
/// Copy construction does not copy loaded data: it delegates to the main constructor with the
/// same settings and a clone of the source, so attributes are re-created and data is loaded again.
FlatDictionary::FlatDictionary(const FlatDictionary & other)
: FlatDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{
}
void FlatDictionary::toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
getItemsNumber<UInt64>(*hierarchical_attribute, ids,
[&] (const std::size_t row, const UInt64 value) { out[row] = value; },
[&] (const std::size_t) { return null_value; });
}
/// Generates FlatDictionary::getUInt8 ... getFloat64 (no explicit default).
/// Each getter looks up `attribute_name`, throws TYPE_MISMATCH unless
/// isAttributeTypeConvertibleTo(attribute.type, TYPE) holds, and fills `out[row]` for every `ids[row]`,
/// using the value taken from `attribute.null_values` as the default.
/// NOTE(review): the default is read via std::get<TYPE>(attribute.null_values); if the convertibility
/// check accepts an attribute whose stored type differs from TYPE, confirm that this slot holds the
/// configured null value.
/// Comments must stay outside the macro body: a `//` comment on a continued line would swallow the rest.
#define DECLARE(TYPE)\
void FlatDictionary::get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItemsNumber<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void FlatDictionary::getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const
{
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
const auto & null_value = StringRef{std::get<String>(attribute.null_values)};
getItemsImpl<StringRef, StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return null_value; });
}
/// Generates FlatDictionary::getUInt8 ... getFloat64 with a per-row default.
/// Same checks as the getters without a default (TYPE_MISMATCH unless the attribute is convertible
/// to TYPE), but a row without a stored value receives `def[row]`.
/// Comments must stay outside the macro body because of the line continuations.
#define DECLARE(TYPE)\
void FlatDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\
PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void FlatDictionary::getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def,
ColumnString * const out) const
{
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsImpl<StringRef, StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t row) { return def->getDataAt(row); });
}
/// Generates FlatDictionary::getUInt8 ... getFloat64 with a single constant default.
/// Same checks as the other getters (TYPE_MISMATCH unless the attribute is convertible to TYPE),
/// but every row without a stored value receives `def`.
/// Comments must stay outside the macro body because of the line continuations.
#define DECLARE(TYPE)\
void FlatDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE def,\
PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return def; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void FlatDictionary::getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def,
ColumnString * const out) const
{
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
FlatDictionary::getItemsImpl<StringRef, StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return StringRef{def}; });
}
/// Fills `out` with one flag per entry of `ids`, computed by the typed has<T> overload.
/// Presence is judged on the FIRST attribute only (attributes.front()); the switch merely selects
/// the template instantiation matching that attribute's underlying type.
void FlatDictionary::has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const
{
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, ids, out); break;
case AttributeUnderlyingType::Int8: has<Int8>(attribute, ids, out); break;
case AttributeUnderlyingType::Int16: has<Int16>(attribute, ids, out); break;
case AttributeUnderlyingType::Int32: has<Int32>(attribute, ids, out); break;
case AttributeUnderlyingType::Int64: has<Int64>(attribute, ids, out); break;
case AttributeUnderlyingType::Float32: has<Float32>(attribute, ids, out); break;
case AttributeUnderlyingType::Float64: has<Float64>(attribute, ids, out); break;
case AttributeUnderlyingType::String: has<String>(attribute, ids, out); break;
}
}
/// Creates storage for every attribute described by `dict_struct`, records its index by name and
/// remembers the hierarchical attribute, which must be UInt64 (TYPE_MISMATCH otherwise).
void FlatDictionary::createAttributes()
{
    /// Capacity is reserved up front for all attributes before any element is appended;
    /// `hierarchical_attribute` below stores the address of an element of this container.
    attributes.reserve(dict_struct.attributes.size());

    for (const auto & description : dict_struct.attributes)
    {
        attribute_index_by_name.emplace(description.name, attributes.size());
        attributes.push_back(createAttributeWithType(description.underlying_type, description.null_value));

        if (!description.hierarchical)
            continue;

        hierarchical_attribute = &attributes.back();

        if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64)
            throw Exception{
                name + ": hierarchical attribute must be UInt64.",
                ErrorCodes::TYPE_MISMATCH};
    }
}
/// Reads every block from the source. Column 0 holds the ids and column `i + 1` holds the values
/// of attribute `i`; each (id, value) pair is stored with setAttributeValue.
/// Throws DICTIONARY_IS_EMPTY when nothing was read and `require_nonempty` is set.
void FlatDictionary::loadData()
{
    auto stream = source_ptr->loadAll();
    stream->readPrefix();

    while (const auto block = stream->read())
    {
        const auto & id_column = *block.getByPosition(0).column;
        const auto rows = id_column.size();

        element_count += rows;

        for (std::size_t attribute_idx = 0; attribute_idx < attributes.size(); ++attribute_idx)
        {
            auto & attribute = attributes[attribute_idx];
            const auto & value_column = *block.getByPosition(attribute_idx + 1).column;

            for (std::size_t row_idx = 0; row_idx < rows; ++row_idx)
                setAttributeValue(attribute, id_column[row_idx].get<UInt64>(), value_column[row_idx]);
        }
    }

    stream->readSuffix();

    const bool source_was_empty = (0 == element_count);
    if (require_nonempty && source_was_empty)
        throw Exception{
            name + ": dictionary source is empty and 'require_nonempty' property is set.",
            ErrorCodes::DICTIONARY_IS_EMPTY};
}
/// Adds the footprint of one attribute's array (the array object plus its allocated buffer)
/// to `bytes_allocated` and overwrites `bucket_count` with that array's capacity.
template <typename T>
void FlatDictionary::addAttributeSize(const attribute_t & attribute)
{
    const auto & container = *std::get<ContainerPtrType<T>>(attribute.arrays);

    bytes_allocated += sizeof(PaddedPODArray<T>) + container.allocated_size();
    bucket_count = container.capacity();
}
/// Accumulates into `bytes_allocated` the size of the attribute descriptors plus, for each attribute,
/// the size of its typed array (via addAttributeSize). String attributes additionally account for
/// their arena.
void FlatDictionary::calculateBytesAllocated()
{
// sizeof is an unevaluated context, so front() is never actually called here.
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
case AttributeUnderlyingType::String:
{
// Strings are stored as StringRef entries whose bytes live in the attribute's arena.
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
}
/// Stores the null value for type T in the attribute and allocates its array with
/// `initial_array_size` elements, each initialised to that null value.
template <typename T>
void FlatDictionary::createAttributeImpl(attribute_t & attribute, const Field & null_value)
{
    std::get<T>(attribute.null_values) = null_value.get<typename NearestFieldType<T>::Type>();

    const T & fill_value = std::get<T>(attribute.null_values);

    std::get<ContainerPtrType<T>>(attribute.arrays) =
        std::make_unique<ContainerType<T>>(initial_array_size, fill_value);
}
/// Creates the storage for one attribute of the given underlying type:
/// records `null_value` in the attribute and allocates an array of `initial_array_size`
/// elements pre-filled with it. String attributes also get an arena for their bytes.
FlatDictionary::attribute_t FlatDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t attr{type};

    switch (type)
    {
        case AttributeUnderlyingType::UInt8: createAttributeImpl<UInt8>(attr, null_value); break;
        case AttributeUnderlyingType::UInt16: createAttributeImpl<UInt16>(attr, null_value); break;
        case AttributeUnderlyingType::UInt32: createAttributeImpl<UInt32>(attr, null_value); break;
        case AttributeUnderlyingType::UInt64: createAttributeImpl<UInt64>(attr, null_value); break;
        case AttributeUnderlyingType::Int8: createAttributeImpl<Int8>(attr, null_value); break;
        case AttributeUnderlyingType::Int16: createAttributeImpl<Int16>(attr, null_value); break;
        case AttributeUnderlyingType::Int32: createAttributeImpl<Int32>(attr, null_value); break;
        case AttributeUnderlyingType::Int64: createAttributeImpl<Int64>(attr, null_value); break;
        case AttributeUnderlyingType::Float32: createAttributeImpl<Float32>(attr, null_value); break;
        case AttributeUnderlyingType::Float64: createAttributeImpl<Float64>(attr, null_value); break;
        case AttributeUnderlyingType::String:
        {
            const auto & null_string = std::get<String>(attr.null_values) = null_value.get<String>();

            /// `attr` is returned by value, so the String inside `attr.null_values` is moved to a new
            /// object afterwards. A StringRef pointing at its buffer may dangle (short strings keep
            /// their bytes inside the string object). Copy the bytes into the arena, which is owned
            /// through a pointer, and fill the array with references to that copy instead.
            attr.string_arena = std::make_unique<Arena>();
            const auto null_in_arena = attr.string_arena->insert(null_string.data(), null_string.size());

            std::get<ContainerPtrType<StringRef>>(attr.arrays) =
                std::make_unique<ContainerType<StringRef>>(initial_array_size, StringRef{null_in_arena, null_string.size()});
            break;
        }
    }

    return attr;
}
/// Dispatches on the attribute's stored numeric type and forwards to
/// getItemsImpl<StoredType, OutputType>. Only the ten numeric types are handled;
/// any other attribute type (String included) ends in a LOGICAL_ERROR exception.
template <typename OutputType, typename ValueSetter, typename DefaultGetter>
void FlatDictionary::getItemsNumber(
const attribute_t & attribute,
const PaddedPODArray<id_t> & ids,
ValueSetter && set_value,
DefaultGetter && get_default) const
{
// Dummy first branch so that every DISPATCH expansion can start with `else if`.
if (false) {}
#define DISPATCH(TYPE) \
else if (attribute.type == AttributeUnderlyingType::TYPE) \
getItemsImpl<TYPE, OutputType>(attribute, ids, std::forward<ValueSetter>(set_value), std::forward<DefaultGetter>(get_default));
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
DISPATCH(UInt64)
DISPATCH(Int8)
DISPATCH(Int16)
DISPATCH(Int32)
DISPATCH(Int64)
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
else
throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
}
/// For each row calls `set_value(row, v)`, where `v` is the stored element for `ids[row]` when the id
/// is inside the array and the element differs from the attribute's null value, and
/// `get_default(row)` otherwise. Adds the number of rows to `query_count`.
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void FlatDictionary::getItemsImpl(
    const attribute_t & attribute,
    const PaddedPODArray<id_t> & ids,
    ValueSetter && set_value,
    DefaultGetter && get_default) const
{
    /// String attributes are stored as StringRef but keep their null value as String.
    using null_value_type = std::conditional_t<std::is_same<AttributeType, StringRef>::value, String, AttributeType>;

    const auto & container = *std::get<ContainerPtrType<AttributeType>>(attribute.arrays);
    const auto null_value = std::get<null_value_type>(attribute.null_values);
    const auto rows = ext::size(ids);

    for (std::size_t row = 0; row < rows; ++row)
    {
        const auto id = ids[row];
        const bool has_value = id < ext::size(container) && container[id] != null_value;

        set_value(row, has_value ? container[id] : get_default(row));
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Stores `value` at index `id` of the attribute's array, first growing the array to `id + 1`
/// elements (new elements set to the attribute's null value) when `id` is past the end.
template <typename T>
void FlatDictionary::setAttributeValueImpl(attribute_t & attribute, const id_t id, const T value)
{
    auto & container = *std::get<ContainerPtrType<T>>(attribute.arrays);

    const bool past_the_end = id >= container.size();
    if (past_the_end)
        container.resize_fill(id + 1, std::get<T>(attribute.null_values));

    container[id] = value;
}
/// Stores `value` for identifier `id` in the given attribute.
/// Throws ARGUMENT_OUT_OF_BOUND when `id` is not below `max_array_size`.
/// The Field is read as UInt64 / Int64 / Float64 / String depending on the attribute type and then
/// passed to the typed setter, which takes it as the attribute's own type.
void FlatDictionary::setAttributeValue(attribute_t & attribute, const id_t id, const Field & value)
{
if (id >= max_array_size)
throw Exception{
name + ": identifier should be less than " + toString(max_array_size),
ErrorCodes::ARGUMENT_OUT_OF_BOUND};
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: setAttributeValueImpl<UInt8>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt16: setAttributeValueImpl<UInt16>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt32: setAttributeValueImpl<UInt32>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::UInt64: setAttributeValueImpl<UInt64>(attribute, id, value.get<UInt64>()); break;
case AttributeUnderlyingType::Int8: setAttributeValueImpl<Int8>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int16: setAttributeValueImpl<Int16>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int32: setAttributeValueImpl<Int32>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Int64: setAttributeValueImpl<Int64>(attribute, id, value.get<Int64>()); break;
case AttributeUnderlyingType::Float32: setAttributeValueImpl<Float32>(attribute, id, value.get<Float64>()); break;
case AttributeUnderlyingType::Float64: setAttributeValueImpl<Float64>(attribute, id, value.get<Float64>()); break;
case AttributeUnderlyingType::String:
{
auto & array = *std::get<ContainerPtrType<StringRef>>(attribute.arrays);
// Grow the array to cover `id`, filling new elements with a reference to the null string.
if (id >= array.size())
array.resize_fill(id + 1, StringRef{std::get<String>(attribute.null_values)});
const auto & string = value.get<String>();
// Copy the bytes into the attribute's arena and store a reference to that copy.
const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size());
array[id] = StringRef{string_in_arena, string.size()};
break;
}
}
}
/// Returns the attribute registered under `attribute_name`.
/// Throws BAD_ARGUMENTS when no attribute has that name.
const FlatDictionary::attribute_t & FlatDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS};
}
/// Sets `out[row]` to 1 when `ids[row]` is inside the attribute's array and the stored element
/// differs from the attribute's null value, and to 0 otherwise. Adds the number of rows to `query_count`.
template <typename T>
void FlatDictionary::has(const attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const
{
    /// String attributes are stored as StringRef but keep their null value as String.
    using stored_type = std::conditional_t<std::is_same<T, String>::value, StringRef, T>;

    const auto & container = *std::get<ContainerPtrType<stored_type>>(attribute.arrays);
    const auto & null_value = std::get<T>(attribute.null_values);
    const auto rows = ext::size(ids);

    for (std::size_t row = 0; row < rows; ++row)
    {
        const auto id = ids[row];
        const bool present = id < ext::size(container) && container[id] != null_value;

        out[row] = present;
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
}
}

View File

@ -0,0 +1,423 @@
#include <ext/size.hpp>
#include <DB/Dictionaries/HashedDictionary.h>
namespace DB
{
/// Error codes referenced in this translation unit. They are only declared here (extern);
/// their definitions live elsewhere.
namespace ErrorCodes
{
    extern const int TYPE_MISMATCH;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int BAD_ARGUMENTS;
    extern const int DICTIONARY_IS_EMPTY;
    /// Used by getItemsNumber for an unexpected attribute type; declared explicitly so this file
    /// does not rely on another header happening to declare it.
    extern const int LOGICAL_ERROR;
}
/// Creates the dictionary and immediately tries to load it from `source_ptr`.
/// Attribute storage is created first (createAttributes may throw to the caller).
/// Any exception raised while loading data or computing the allocated size is NOT propagated:
/// it is stored in `creation_exception`. `creation_time` is set in either case.
HashedDictionary::HashedDictionary(const std::string & name, const DictionaryStructure & dict_struct,
DictionarySourcePtr source_ptr, const DictionaryLifetime dict_lifetime, bool require_nonempty)
: name{name}, dict_struct(dict_struct), source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
createAttributes();
try
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
// Keep the failure for later inspection instead of throwing out of the constructor.
creation_exception = std::current_exception();
}
creation_time = std::chrono::system_clock::now();
}
/// Copy construction does not copy loaded data: it delegates to the main constructor with the
/// same settings and a clone of the source, so attributes are re-created and data is loaded again.
HashedDictionary::HashedDictionary(const HashedDictionary & other)
: HashedDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{
}
void HashedDictionary::toParent(const PaddedPODArray<id_t> & ids, PaddedPODArray<id_t> & out) const
{
const auto null_value = std::get<UInt64>(hierarchical_attribute->null_values);
getItemsNumber<UInt64>(*hierarchical_attribute, ids,
[&] (const std::size_t row, const UInt64 value) { out[row] = value; },
[&] (const std::size_t) { return null_value; });
}
/// Generates HashedDictionary::getUInt8 ... getFloat64 (no explicit default).
/// Each getter looks up `attribute_name`, throws TYPE_MISMATCH unless
/// isAttributeTypeConvertibleTo(attribute.type, TYPE) holds, and fills `out[row]` for every `ids[row]`,
/// using the value taken from `attribute.null_values` as the default.
/// NOTE(review): the default is read via std::get<TYPE>(attribute.null_values); if the convertibility
/// check accepts an attribute whose stored type differs from TYPE, confirm that this slot holds the
/// configured null value.
/// Comments must stay outside the macro body: a `//` comment on a continued line would swallow the rest.
#define DECLARE(TYPE)\
void HashedDictionary::get##TYPE(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
const auto null_value = std::get<TYPE>(attribute.null_values);\
\
getItemsNumber<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return null_value; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void HashedDictionary::getString(const std::string & attribute_name, const PaddedPODArray<id_t> & ids, ColumnString * out) const
{
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
const auto & null_value = StringRef{std::get<String>(attribute.null_values)};
getItemsImpl<StringRef, StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return null_value; });
}
/// Generates HashedDictionary::getUInt8 ... getFloat64 with a per-row default.
/// Same checks as the getters without a default (TYPE_MISMATCH unless the attribute is convertible
/// to TYPE), but a row whose id is not found receives `def[row]`.
/// Comments must stay outside the macro body because of the line continuations.
#define DECLARE(TYPE)\
void HashedDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<TYPE> & def,\
PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t row) { return def[row]; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void HashedDictionary::getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const ColumnString * const def,
ColumnString * const out) const
{
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsImpl<StringRef, StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t row) { return def->getDataAt(row); });
}
/// Generates HashedDictionary::getUInt8 ... getFloat64 with a single constant default.
/// Same checks as the other getters (TYPE_MISMATCH unless the attribute is convertible to TYPE),
/// but every row whose id is not found receives `def`.
/// Comments must stay outside the macro body because of the line continuations.
#define DECLARE(TYPE)\
void HashedDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const TYPE & def, PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttribute(attribute_name);\
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::TYPE))\
throw Exception{\
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),\
ErrorCodes::TYPE_MISMATCH};\
\
getItemsNumber<TYPE>(attribute, ids,\
[&] (const std::size_t row, const auto value) { out[row] = value; },\
[&] (const std::size_t) { return def; });\
}
DECLARE(UInt8)
DECLARE(UInt16)
DECLARE(UInt32)
DECLARE(UInt64)
DECLARE(Int8)
DECLARE(Int16)
DECLARE(Int32)
DECLARE(Int64)
DECLARE(Float32)
DECLARE(Float64)
#undef DECLARE
void HashedDictionary::getString(
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const String & def,
ColumnString * const out) const
{
const auto & attribute = getAttribute(attribute_name);
if (!isAttributeTypeConvertibleTo(attribute.type, AttributeUnderlyingType::String))
throw Exception{
name + ": type mismatch: attribute " + attribute_name + " has type " + toString(attribute.type),
ErrorCodes::TYPE_MISMATCH};
getItemsImpl<StringRef, StringRef>(attribute, ids,
[&] (const std::size_t row, const StringRef value) { out->insertData(value.data, value.size); },
[&] (const std::size_t) { return StringRef{def}; });
}
/// Fills `out` with one flag per entry of `ids`, computed by the typed has<T> overload.
/// Presence is judged on the FIRST attribute only (attributes.front()); the switch merely selects
/// the template instantiation matching that attribute's underlying type
/// (String attributes use the StringRef instantiation).
void HashedDictionary::has(const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const
{
const auto & attribute = attributes.front();
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: has<UInt8>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt16: has<UInt16>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt32: has<UInt32>(attribute, ids, out); break;
case AttributeUnderlyingType::UInt64: has<UInt64>(attribute, ids, out); break;
case AttributeUnderlyingType::Int8: has<Int8>(attribute, ids, out); break;
case AttributeUnderlyingType::Int16: has<Int16>(attribute, ids, out); break;
case AttributeUnderlyingType::Int32: has<Int32>(attribute, ids, out); break;
case AttributeUnderlyingType::Int64: has<Int64>(attribute, ids, out); break;
case AttributeUnderlyingType::Float32: has<Float32>(attribute, ids, out); break;
case AttributeUnderlyingType::Float64: has<Float64>(attribute, ids, out); break;
case AttributeUnderlyingType::String: has<StringRef>(attribute, ids, out); break;
}
}
/// Creates storage for every attribute described by `dict_struct`, records its index by name and
/// remembers the hierarchical attribute, which must be UInt64 (TYPE_MISMATCH otherwise).
void HashedDictionary::createAttributes()
{
    /// Capacity is reserved up front for all attributes before any element is appended;
    /// `hierarchical_attribute` below stores the address of an element of this container.
    attributes.reserve(dict_struct.attributes.size());

    for (const auto & description : dict_struct.attributes)
    {
        attribute_index_by_name.emplace(description.name, attributes.size());
        attributes.push_back(createAttributeWithType(description.underlying_type, description.null_value));

        if (!description.hierarchical)
            continue;

        hierarchical_attribute = &attributes.back();

        if (hierarchical_attribute->type != AttributeUnderlyingType::UInt64)
            throw Exception{
                name + ": hierarchical attribute must be UInt64.",
                ErrorCodes::TYPE_MISMATCH};
    }
}
/// Reads every block from the source. Column 0 holds the ids and column `i + 1` holds the values
/// of attribute `i`; each (id, value) pair is stored with setAttributeValue.
/// Throws DICTIONARY_IS_EMPTY when nothing was read and `require_nonempty` is set.
void HashedDictionary::loadData()
{
    auto stream = source_ptr->loadAll();
    stream->readPrefix();

    while (const auto block = stream->read())
    {
        const auto & id_column = *block.getByPosition(0).column;
        const auto rows = id_column.size();

        element_count += rows;

        for (std::size_t attribute_idx = 0; attribute_idx < attributes.size(); ++attribute_idx)
        {
            auto & attribute = attributes[attribute_idx];
            const auto & value_column = *block.getByPosition(attribute_idx + 1).column;

            for (std::size_t row_idx = 0; row_idx < rows; ++row_idx)
                setAttributeValue(attribute, id_column[row_idx].get<UInt64>(), value_column[row_idx]);
        }
    }

    stream->readSuffix();

    const bool source_was_empty = (0 == element_count);
    if (require_nonempty && source_was_empty)
        throw Exception{
            name + ": dictionary source is empty and 'require_nonempty' property is set.",
            ErrorCodes::DICTIONARY_IS_EMPTY};
}
template <typename T>
void HashedDictionary::addAttributeSize(const attribute_t & attribute)
{
const auto & map_ref = std::get<CollectionPtrType<T>>(attribute.maps);
bytes_allocated += sizeof(CollectionType<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
/// Accumulates into `bytes_allocated` the size of the attribute descriptors plus, for each attribute,
/// the size of its typed map (via addAttributeSize). String attributes additionally account for
/// their arena.
void HashedDictionary::calculateBytesAllocated()
{
// sizeof is an unevaluated context, so front() is never actually called here.
bytes_allocated += attributes.size() * sizeof(attributes.front());
for (const auto & attribute : attributes)
{
switch (attribute.type)
{
case AttributeUnderlyingType::UInt8: addAttributeSize<UInt8>(attribute); break;
case AttributeUnderlyingType::UInt16: addAttributeSize<UInt16>(attribute); break;
case AttributeUnderlyingType::UInt32: addAttributeSize<UInt32>(attribute); break;
case AttributeUnderlyingType::UInt64: addAttributeSize<UInt64>(attribute); break;
case AttributeUnderlyingType::Int8: addAttributeSize<Int8>(attribute); break;
case AttributeUnderlyingType::Int16: addAttributeSize<Int16>(attribute); break;
case AttributeUnderlyingType::Int32: addAttributeSize<Int32>(attribute); break;
case AttributeUnderlyingType::Int64: addAttributeSize<Int64>(attribute); break;
case AttributeUnderlyingType::Float32: addAttributeSize<Float32>(attribute); break;
case AttributeUnderlyingType::Float64: addAttributeSize<Float64>(attribute); break;
case AttributeUnderlyingType::String:
{
// Strings are kept in a StringRef-valued map; the arena is counted separately.
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
break;
}
}
}
}
/// Stores the null value for type T in the attribute and creates an empty map for its values.
template <typename T>
void HashedDictionary::createAttributeImpl(attribute_t & attribute, const Field & null_value)
{
    auto & stored_null_value = std::get<T>(attribute.null_values);
    stored_null_value = null_value.get<typename NearestFieldType<T>::Type>();

    auto & map_slot = std::get<CollectionPtrType<T>>(attribute.maps);
    map_slot = std::make_unique<CollectionType<T>>();
}
/// Creates the storage for one attribute of the given underlying type: records `null_value`
/// in the attribute and creates an empty map for its values.
/// String attributes keep the null value as String, store values in a StringRef map and
/// additionally get an arena.
HashedDictionary::attribute_t HashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
attribute_t attr{type};
switch (type)
{
case AttributeUnderlyingType::UInt8: createAttributeImpl<UInt8>(attr, null_value); break;
case AttributeUnderlyingType::UInt16: createAttributeImpl<UInt16>(attr, null_value); break;
case AttributeUnderlyingType::UInt32: createAttributeImpl<UInt32>(attr, null_value); break;
case AttributeUnderlyingType::UInt64: createAttributeImpl<UInt64>(attr, null_value); break;
case AttributeUnderlyingType::Int8: createAttributeImpl<Int8>(attr, null_value); break;
case AttributeUnderlyingType::Int16: createAttributeImpl<Int16>(attr, null_value); break;
case AttributeUnderlyingType::Int32: createAttributeImpl<Int32>(attr, null_value); break;
case AttributeUnderlyingType::Int64: createAttributeImpl<Int64>(attr, null_value); break;
case AttributeUnderlyingType::Float32: createAttributeImpl<Float32>(attr, null_value); break;
case AttributeUnderlyingType::Float64: createAttributeImpl<Float64>(attr, null_value); break;
case AttributeUnderlyingType::String:
{
// Handled inline rather than through createAttributeImpl: the null value type (String)
// differs from the map's value type (StringRef), and an arena is needed as well.
std::get<String>(attr.null_values) = null_value.get<String>();
std::get<CollectionPtrType<StringRef>>(attr.maps) = std::make_unique<CollectionType<StringRef>>();
attr.string_arena = std::make_unique<Arena>();
break;
}
}
return attr;
}
template <typename OutputType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsNumber(
const attribute_t & attribute,
const PaddedPODArray<id_t> & ids,
ValueSetter && set_value,
DefaultGetter && get_default) const
{
if (false) {}
#define DISPATCH(TYPE) \
else if (attribute.type == AttributeUnderlyingType::TYPE) \
getItemsImpl<TYPE, OutputType>(attribute, ids, std::forward<ValueSetter>(set_value), std::forward<DefaultGetter>(get_default));
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
DISPATCH(UInt64)
DISPATCH(Int8)
DISPATCH(Int16)
DISPATCH(Int32)
DISPATCH(Int64)
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
else
throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
}
/// For every requested id passes either the stored value or get_default(row) to set_value(row, ...),
/// then adds the number of processed rows to query_count.
template <typename AttributeType, typename OutputType, typename ValueSetter, typename DefaultGetter>
void HashedDictionary::getItemsImpl(
    const attribute_t & attribute,
    const PaddedPODArray<id_t> & ids,
    ValueSetter && set_value,
    DefaultGetter && get_default) const
{
    const auto & collection = *std::get<CollectionPtrType<AttributeType>>(attribute.maps);
    const auto rows = ext::size(ids);

    for (const auto row : ext::range(0, rows))
    {
        const auto found = collection.find(ids[row]);
        /// The conditional expression is kept so both branches are converted to one common type.
        set_value(row, found != collection.end() ? found->second : get_default(row));
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
}
/// Inserts the pair (id, value) into the attribute's collection for type T.
template <typename T>
void HashedDictionary::setAttributeValueImpl(attribute_t & attribute, const id_t id, const T value)
{
    std::get<CollectionPtrType<T>>(attribute.maps)->insert({ id, value });
}
/// Stores `value` for `id` in the attribute, dispatching on the attribute's underlying type.
/// Numeric values are read from the Field as UInt64 / Int64 / Float64 and passed to setAttributeValueImpl<T>;
/// strings are first copied into the attribute's arena and stored as a StringRef to that copy.
void HashedDictionary::setAttributeValue(attribute_t & attribute, const id_t id, const Field & value)
{
    switch (attribute.type)
    {
#define STORE_NUMERIC(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            setAttributeValueImpl<TYPE>(attribute, id, value.get<FIELD_TYPE>()); \
            break;
        STORE_NUMERIC(UInt8, UInt64)
        STORE_NUMERIC(UInt16, UInt64)
        STORE_NUMERIC(UInt32, UInt64)
        STORE_NUMERIC(UInt64, UInt64)
        STORE_NUMERIC(Int8, Int64)
        STORE_NUMERIC(Int16, Int64)
        STORE_NUMERIC(Int32, Int64)
        STORE_NUMERIC(Int64, Int64)
        STORE_NUMERIC(Float32, Float64)
        STORE_NUMERIC(Float64, Float64)
#undef STORE_NUMERIC
        case AttributeUnderlyingType::String:
        {
            const auto & source = value.get<String>();
            const auto length = source.size();
            const auto arena_copy = attribute.string_arena->insert(source.data(), length);

            std::get<CollectionPtrType<StringRef>>(attribute.maps)->insert({ id, StringRef{arena_copy, length} });
            break;
        }
    }
}
/// Returns the attribute registered under attribute_name.
/// Throws BAD_ARGUMENTS if the name is not present in attribute_index_by_name.
const HashedDictionary::attribute_t & HashedDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS};
}
/// Fills out[row] with 1 if ids[row] is present in the attribute's collection of type T and 0 otherwise,
/// then adds the number of processed rows to query_count. `out` is indexed, not resized, here.
template <typename T>
void HashedDictionary::has(const attribute_t & attribute, const PaddedPODArray<id_t> & ids, PaddedPODArray<UInt8> & out) const
{
    const auto & collection = *std::get<CollectionPtrType<T>>(attribute.maps);
    const auto rows = ext::size(ids);

    for (const auto row : ext::range(0, rows))
    {
        const auto found = collection.find(ids[row]);
        out[row] = found != std::end(collection);
    }

    query_count.fetch_add(rows, std::memory_order_relaxed);
}
}

View File

@ -1,3 +1,158 @@
#include "DB/Dictionaries/MySQLDictionarySource.h" #include <DB/Dictionaries/MySQLDictionarySource.h>
decltype(DB::MySQLDictionarySource::max_block_size) DB::MySQLDictionarySource::max_block_size;
namespace DB
{
/// Out-of-class definition of the static member max_block_size;
/// its value is passed to every MySQLBlockInputStream created in this file.
decltype(MySQLDictionarySource::max_block_size) MySQLDictionarySource::max_block_size;
/** Builds a MySQL dictionary source from the configuration subtree at config_prefix.
  * Keys read: ".db" (default ""), ".table" (no default is passed), ".where" (default ""),
  * ".dont_check_update_time" (default false). The connection pool is constructed from the same
  * config and prefix. The "load all" query text is composed once here and stored.
  */
MySQLDictionarySource::MySQLDictionarySource(const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix,
const Block & sample_block)
: dict_struct{dict_struct_},
db{config.getString(config_prefix + ".db", "")},
table{config.getString(config_prefix + ".table")},
where{config.getString(config_prefix + ".where", "")},
dont_check_update_time{config.getBool(config_prefix + ".dont_check_update_time", false)},
sample_block{sample_block},
pool{config, config_prefix},
/// query_builder receives the members initialised above, not the constructor arguments.
query_builder{dict_struct, db, table, where},
load_all_query{query_builder.composeLoadAllQuery()}
{
}
/// Copy constructor, provided to support clone().
/// All settings, the pool, the stored "load all" query and last_modification are copied from `other`;
/// query_builder is not copied but constructed anew from this object's own members.
MySQLDictionarySource::MySQLDictionarySource(const MySQLDictionarySource & other)
: dict_struct{other.dict_struct},
db{other.db},
table{other.table},
where{other.where},
dont_check_update_time{other.dont_check_update_time},
sample_block{other.sample_block},
pool{other.pool},
query_builder{dict_struct, db, table, where},
load_all_query{other.load_all_query}, last_modification{other.last_modification}
{
}
/// Returns a stream that executes the precomposed "load all" query on a connection taken from the pool.
/// Before that, refreshes last_modification and writes the query text to the trace log.
BlockInputStreamPtr MySQLDictionarySource::loadAll()
{
last_modification = getLastModification();
LOG_TRACE(log, load_all_query);
return std::make_shared<MySQLBlockInputStream>(pool.Get(), load_all_query, sample_block, max_block_size);
}
/// Returns a stream that executes a query composed by query_builder for the given ids.
BlockInputStreamPtr MySQLDictionarySource::loadIds(const std::vector<std::uint64_t> & ids)
{
/// We neither log nor update the modification time here, because the query may be large and may be issued frequently.
const auto query = query_builder.composeLoadIdsQuery(ids);
return std::make_shared<MySQLBlockInputStream>(pool.Get(), query, sample_block, max_block_size);
}
/// Returns a stream that executes a query composed by query_builder for the requested rows of key_columns,
/// using the AND_OR_CHAIN composition method.
BlockInputStreamPtr MySQLDictionarySource::loadKeys(
const ConstColumnPlainPtrs & key_columns, const std::vector<std::size_t> & requested_rows)
{
/// We neither log nor update the modification time here, because the query may be large and may be issued frequently.
const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN);
return std::make_shared<MySQLBlockInputStream>(pool.Get(), query, sample_block, max_block_size);
}
/// Reports whether the source should be considered changed since the last loadAll().
/// When dont_check_update_time is set the answer is always true and the server is not queried;
/// otherwise the current modification time is compared with the remembered one.
bool MySQLDictionarySource::isModified() const
{
    return dont_check_update_time || getLastModification() > last_modification;
}
/// Always returns true (presumably indicating that loadIds / loadKeys are usable — confirm against the interface).
bool MySQLDictionarySource::supportsSelectiveLoad() const
{
return true;
}
/// Returns a new source copy-constructed from this one.
DictionarySourcePtr MySQLDictionarySource::clone() const
{
return std::make_unique<MySQLDictionarySource>(*this);
}
/// Human-readable description of the source: "MySQL: db.table", followed by ", where: ..." if a condition is set.
std::string MySQLDictionarySource::toString() const
{
    std::string description = "MySQL: " + db + '.' + table;

    if (!where.empty())
        description += ", where: " + where;

    return description;
}
/// Prepares a string for use as a LIKE pattern that matches it literally:
/// '%', '_' and '\' are prefixed with a backslash, then the result is written through writeQuoted.
std::string MySQLDictionarySource::quoteForLike(const std::string s)
{
    std::string escaped;
    escaped.reserve(s.size());

    for (const char ch : s)
    {
        switch (ch)
        {
            case '%':
            case '_':
            case '\\':
                escaped.push_back('\\');
                break;
            default:
                break;
        }

        escaped.push_back(ch);
    }

    std::string quoted;
    {
        /// The buffer is scoped so that it is destroyed before `quoted` is returned.
        WriteBufferFromString out(quoted);
        writeQuoted(escaped, out);
    }

    return quoted;
}
/** Returns the table's update time as reported by SHOW TABLE STATUS.
  * Falls back to the current time when dont_check_update_time is set, when the reported value is NULL,
  * when no row is returned, or when any exception is thrown (exceptions are logged, not propagated).
  */
LocalDateTime MySQLDictionarySource::getLastModification() const
{
/// Start from the current time; it is the value returned on every fallback path.
LocalDateTime update_time{std::time(nullptr)};
if (dont_check_update_time)
return update_time;
try
{
auto connection = pool.Get();
auto query = connection->query("SHOW TABLE STATUS LIKE " + quoteForLike(table));
LOG_TRACE(log, query.str());
auto result = query.use();
size_t fetched_rows = 0;
/// Only the first row is inspected; the rest are merely counted.
if (auto row = result.fetch())
{
++fetched_rows;
/// Index of the column read as the update time (presumably Update_time in the SHOW TABLE STATUS result — confirm).
const auto UPDATE_TIME_IDX = 12;
const auto & update_time_value = row[UPDATE_TIME_IDX];
if (!update_time_value.isNull())
{
update_time = update_time_value.getDateTime();
LOG_TRACE(log, "Got update time: " << update_time);
}
/// fetch remaining rows to avoid "commands out of sync" error
while (result.fetch())
++fetched_rows;
}
if (0 == fetched_rows)
LOG_ERROR(log, "Cannot find table in SHOW TABLE STATUS result.");
if (fetched_rows > 1)
LOG_ERROR(log, "Found more than one table in SHOW TABLE STATUS result.");
}
catch (...)
{
tryLogCurrentException("MySQLDictionarySource");
}
/// we suppose failure to get modification time is not an error, therefore return current time
return update_time;
}
}

View File

@ -1,3 +1,80 @@
#include "DB/Dictionaries/ODBCDictionarySource.h" #include <DB/Dictionaries/ODBCDictionarySource.h>
decltype(DB::ODBCDictionarySource::max_block_size) DB::ODBCDictionarySource::max_block_size;
namespace DB
{
/// Out-of-class definition of the static member max_block_size;
/// its value is passed to every ODBCBlockInputStream created in this file.
decltype(ODBCDictionarySource::max_block_size) ODBCDictionarySource::max_block_size;
/** Builds an ODBC dictionary source from the configuration subtree at config_prefix.
  * Keys read: ".db" (default ""), ".table" (no default is passed), ".where" (default ""),
  * ".connector" (default "ODBC") and ".connection_string" (no default is passed).
  * The last two are used to create a shared Poco::Data::SessionPool.
  * The "load all" query text is composed once here and stored.
  */
ODBCDictionarySource::ODBCDictionarySource(const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix,
const Block & sample_block)
: dict_struct{dict_struct_},
db{config.getString(config_prefix + ".db", "")},
table{config.getString(config_prefix + ".table")},
where{config.getString(config_prefix + ".where", "")},
sample_block{sample_block},
pool{std::make_shared<Poco::Data::SessionPool>(
config.getString(config_prefix + ".connector", "ODBC"),
config.getString(config_prefix + ".connection_string"))},
/// query_builder receives the members initialised above, not the constructor arguments.
query_builder{dict_struct, db, table, where},
load_all_query{query_builder.composeLoadAllQuery()}
{
}
/// Copy constructor, provided to support clone().
/// Settings, the pool pointer and the stored "load all" query are copied from `other`;
/// query_builder is not copied but constructed anew from this object's own members.
ODBCDictionarySource::ODBCDictionarySource(const ODBCDictionarySource & other)
: dict_struct{other.dict_struct},
db{other.db},
table{other.table},
where{other.where},
sample_block{other.sample_block},
pool{other.pool},
query_builder{dict_struct, db, table, where},
load_all_query{other.load_all_query}
{
}
/// Returns a stream that executes the precomposed "load all" query on a session taken from the pool.
/// The query text is written to the trace log first.
BlockInputStreamPtr ODBCDictionarySource::loadAll()
{
LOG_TRACE(log, load_all_query);
return std::make_shared<ODBCBlockInputStream>(pool->get(), load_all_query, sample_block, max_block_size);
}
/// Returns a stream that executes a query composed by query_builder for the given ids. Nothing is logged here.
BlockInputStreamPtr ODBCDictionarySource::loadIds(const std::vector<std::uint64_t> & ids)
{
const auto query = query_builder.composeLoadIdsQuery(ids);
return std::make_shared<ODBCBlockInputStream>(pool->get(), query, sample_block, max_block_size);
}
/// Returns a stream that executes a query composed by query_builder for the requested rows of key_columns,
/// using the AND_OR_CHAIN composition method. Nothing is logged here.
BlockInputStreamPtr ODBCDictionarySource::loadKeys(
const ConstColumnPlainPtrs & key_columns, const std::vector<std::size_t> & requested_rows)
{
const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN);
return std::make_shared<ODBCBlockInputStream>(pool->get(), query, sample_block, max_block_size);
}
/// Always returns true: this source performs no modification-time check.
bool ODBCDictionarySource::isModified() const
{
return true;
}
/// Always returns true (presumably indicating that loadIds / loadKeys are usable — confirm against the interface).
bool ODBCDictionarySource::supportsSelectiveLoad() const
{
return true;
}
/// Returns a new source copy-constructed from this one.
DictionarySourcePtr ODBCDictionarySource::clone() const
{
return std::make_unique<ODBCDictionarySource>(*this);
}
/// Human-readable description of the source: "ODBC: db.table", followed by ", where: ..." if a condition is set.
std::string ODBCDictionarySource::toString() const
{
    std::string description = "ODBC: " + db + '.' + table;

    if (!where.empty())
        description += ", where: " + where;

    return description;
}
}

View File

@ -0,0 +1,355 @@
#include <DB/Dictionaries/RangeHashedDictionary.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
}
/** Creates the dictionary and immediately loads its contents from the source.
  * createAttributes() runs outside the try block, so its exceptions propagate to the caller.
  * An exception thrown while loading data or calculating the allocated size is not propagated:
  * it is stored in creation_exception. creation_time is recorded in either case.
  */
RangeHashedDictionary::RangeHashedDictionary(
const std::string & name, const DictionaryStructure & dict_struct, DictionarySourcePtr source_ptr,
const DictionaryLifetime dict_lifetime, bool require_nonempty)
: name{name}, dict_struct(dict_struct),
source_ptr{std::move(source_ptr)}, dict_lifetime(dict_lifetime),
require_nonempty(require_nonempty)
{
createAttributes();
try
{
loadData();
calculateBytesAllocated();
}
catch (...)
{
creation_exception = std::current_exception();
}
creation_time = std::chrono::system_clock::now();
}
/// Copy constructor: delegates to the main constructor with a clone of the other dictionary's source,
/// so the data is loaded again from the source rather than copied from `other`.
RangeHashedDictionary::RangeHashedDictionary(const RangeHashedDictionary & other)
: RangeHashedDictionary{other.name, other.dict_struct, other.source_ptr->clone(), other.dict_lifetime, other.require_nonempty}
{
}
/// Defines getUInt8 ... getFloat64: for each numeric TYPE, a getter that checks that the named attribute
/// has exactly this underlying type (getAttributeWithType throws otherwise) and then fills `out` via getItems<TYPE>.
#define DECLARE_MULTIPLE_GETTER(TYPE)\
void RangeHashedDictionary::get##TYPE(\
const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates,\
PaddedPODArray<TYPE> & out) const\
{\
const auto & attribute = getAttributeWithType(attribute_name, AttributeUnderlyingType::TYPE);\
getItems<TYPE>(attribute, ids, dates, out);\
}
DECLARE_MULTIPLE_GETTER(UInt8)
DECLARE_MULTIPLE_GETTER(UInt16)
DECLARE_MULTIPLE_GETTER(UInt32)
DECLARE_MULTIPLE_GETTER(UInt64)
DECLARE_MULTIPLE_GETTER(Int8)
DECLARE_MULTIPLE_GETTER(Int16)
DECLARE_MULTIPLE_GETTER(Int32)
DECLARE_MULTIPLE_GETTER(Int64)
DECLARE_MULTIPLE_GETTER(Float32)
DECLARE_MULTIPLE_GETTER(Float64)
#undef DECLARE_MULTIPLE_GETTER
/// Appends one string per row to `out`: the value of the first stored range of ids[row] that contains dates[row],
/// or the attribute's null value when the id is unknown or no range matches.
/// Adds the number of rows to query_count.
void RangeHashedDictionary::getString(
    const std::string & attribute_name, const PaddedPODArray<id_t> & ids, const PaddedPODArray<UInt16> & dates,
    ColumnString * out) const
{
    const auto & attribute = getAttributeWithType(attribute_name, AttributeUnderlyingType::String);
    const auto & collection = *std::get<ptr_t<StringRef>>(attribute.maps);
    const auto & default_string = std::get<String>(attribute.null_values);

    for (const auto row : ext::range(0, ids.size()))
    {
        const auto found = collection.find(ids[row]);

        if (found == std::end(collection))
        {
            out->insertData(default_string.data(), default_string.size());
            continue;
        }

        const auto date = dates[row];
        const auto & candidates = found->second;
        const auto match = std::find_if(std::begin(candidates), std::end(candidates),
            [date] (const value_t<StringRef> & candidate) { return candidate.range.contains(date); });

        const auto string_ref = match != std::end(candidates) ? match->value : StringRef{default_string};
        out->insertData(string_ref.data, string_ref.size);
    }

    query_count.fetch_add(ids.size(), std::memory_order_relaxed);
}
/// Creates one attribute per entry of dict_struct.attributes and records its index by name.
/// Throws BAD_ARGUMENTS on the first hierarchical attribute encountered (after it has been registered).
void RangeHashedDictionary::createAttributes()
{
    attributes.reserve(dict_struct.attributes.size());

    for (const auto & description : dict_struct.attributes)
    {
        const auto index = attributes.size();
        attribute_index_by_name.emplace(description.name, index);
        attributes.push_back(createAttributeWithType(description.underlying_type, description.null_value));

        if (description.hierarchical)
        {
            throw Exception{
                name + ": hierarchical attributes not supported by " + getName() + " dictionary.",
                ErrorCodes::BAD_ARGUMENTS};
        }
    }
}
/// Reads every block from the source and stores its rows in the attributes.
/// Block layout used here: column 0 is the id, columns 1 and 2 are the range bounds,
/// and attribute values start at column 3 in the order of `attributes`.
/// Throws DICTIONARY_IS_EMPTY if nothing was read and require_nonempty is set.
void RangeHashedDictionary::loadData()
{
    auto stream = source_ptr->loadAll();
    stream->readPrefix();

    while (const auto block = stream->read())
    {
        const auto & ids = *block.getByPosition(0).column;
        const auto & range_starts = *block.getByPosition(1).column;
        const auto & range_ends = *block.getByPosition(2).column;

        element_count += ids.size();

        for (const auto attribute_idx : ext::range(0, attributes.size()))
        {
            const auto & values = *block.getByPosition(attribute_idx + 3).column;
            auto & target = attributes[attribute_idx];

            for (const auto row : ext::range(0, ids.size()))
            {
                setAttributeValue(target, ids[row].get<UInt64>(),
                    range_t(range_starts[row].get<UInt64>(), range_ends[row].get<UInt64>()),
                    values[row]);
            }
        }
    }

    stream->readSuffix();

    const bool nothing_loaded = 0 == element_count;
    if (require_nonempty && nothing_loaded)
        throw Exception{
            name + ": dictionary source is empty and 'require_nonempty' property is set.",
            ErrorCodes::DICTIONARY_IS_EMPTY};
}
template <typename T>
void RangeHashedDictionary::addAttributeSize(const attribute_t & attribute)
{
const auto & map_ref = std::get<ptr_t<T>>(attribute.maps);
bytes_allocated += sizeof(collection_t<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
/// Accumulates into bytes_allocated the size of the attribute descriptors and, per attribute,
/// the size of its collection; for String attributes the Arena object and its size are added as well.
void RangeHashedDictionary::calculateBytesAllocated()
{
    bytes_allocated += attributes.size() * sizeof(attributes.front());

    for (const auto & attribute : attributes)
    {
        switch (attribute.type)
        {
#define ADD_NUMERIC_SIZE(TYPE) \
            case AttributeUnderlyingType::TYPE: addAttributeSize<TYPE>(attribute); break;
            ADD_NUMERIC_SIZE(UInt8)
            ADD_NUMERIC_SIZE(UInt16)
            ADD_NUMERIC_SIZE(UInt32)
            ADD_NUMERIC_SIZE(UInt64)
            ADD_NUMERIC_SIZE(Int8)
            ADD_NUMERIC_SIZE(Int16)
            ADD_NUMERIC_SIZE(Int32)
            ADD_NUMERIC_SIZE(Int64)
            ADD_NUMERIC_SIZE(Float32)
            ADD_NUMERIC_SIZE(Float64)
#undef ADD_NUMERIC_SIZE
            case AttributeUnderlyingType::String:
            {
                addAttributeSize<StringRef>(attribute);
                bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
                break;
            }
        }
    }
}
/// Prepares a numeric attribute of underlying type T: stores the configured null value
/// (read from the Field as the nearest Field type of T) and allocates an empty collection for it.
template <typename T>
void RangeHashedDictionary::createAttributeImpl(attribute_t & attribute, const Field & null_value)
{
    using FieldValueType = typename NearestFieldType<T>::Type;

    auto & null_value_slot = std::get<T>(attribute.null_values);
    null_value_slot = null_value.get<FieldValueType>();

    auto & collection_slot = std::get<ptr_t<T>>(attribute.maps);
    collection_slot = std::make_unique<collection_t<T>>();
}
/// Creates an attribute of the requested underlying type with its null value and an empty collection.
/// Numeric types are delegated to createAttributeImpl<T>; String additionally gets its own Arena.
RangeHashedDictionary::attribute_t RangeHashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    attribute_t result{type};

    switch (type)
    {
#define CREATE_NUMERIC(TYPE) \
        case AttributeUnderlyingType::TYPE: createAttributeImpl<TYPE>(result, null_value); break;
        CREATE_NUMERIC(UInt8)
        CREATE_NUMERIC(UInt16)
        CREATE_NUMERIC(UInt32)
        CREATE_NUMERIC(UInt64)
        CREATE_NUMERIC(Int8)
        CREATE_NUMERIC(Int16)
        CREATE_NUMERIC(Int32)
        CREATE_NUMERIC(Int64)
        CREATE_NUMERIC(Float32)
        CREATE_NUMERIC(Float64)
#undef CREATE_NUMERIC
        case AttributeUnderlyingType::String:
        {
            /// The null value is kept as an owning String, while the collection holds StringRef values.
            std::get<String>(result.null_values) = null_value.get<String>();
            std::get<ptr_t<StringRef>>(result.maps) = std::make_unique<collection_t<StringRef>>();
            result.string_arena = std::make_unique<Arena>();
            break;
        }
    }

    return result;
}
template <typename OutputType>
void RangeHashedDictionary::getItems(
const attribute_t & attribute,
const PaddedPODArray<id_t> & ids,
const PaddedPODArray<UInt16> & dates,
PaddedPODArray<OutputType> & out) const
{
if (false) {}
#define DISPATCH(TYPE) \
else if (attribute.type == AttributeUnderlyingType::TYPE) \
getItemsImpl<TYPE, OutputType>(attribute, ids, dates, out);
DISPATCH(UInt8)
DISPATCH(UInt16)
DISPATCH(UInt32)
DISPATCH(UInt64)
DISPATCH(Int8)
DISPATCH(Int16)
DISPATCH(Int32)
DISPATCH(Int64)
DISPATCH(Float32)
DISPATCH(Float64)
#undef DISPATCH
else
throw Exception("Unexpected type of attribute: " + toString(attribute.type), ErrorCodes::LOGICAL_ERROR);
}
/// Writes into out[row] the value of the first stored range of ids[row] that contains dates[row],
/// or the attribute's null value when the id is unknown or no range matches.
/// `out` is indexed, not resized, here. Adds the number of rows to query_count.
template <typename AttributeType, typename OutputType>
void RangeHashedDictionary::getItemsImpl(
    const attribute_t & attribute,
    const PaddedPODArray<id_t> & ids,
    const PaddedPODArray<UInt16> & dates,
    PaddedPODArray<OutputType> & out) const
{
    const auto & collection = *std::get<ptr_t<AttributeType>>(attribute.maps);
    const auto default_value = std::get<AttributeType>(attribute.null_values);

    for (const auto row : ext::range(0, ids.size()))
    {
        const auto found = collection.find(ids[row]);

        if (found == std::end(collection))
        {
            out[row] = default_value;
            continue;
        }

        const auto date = dates[row];
        const auto & candidates = found->second;
        const auto match = std::find_if(std::begin(candidates), std::end(candidates),
            [date] (const value_t<AttributeType> & candidate) { return candidate.range.contains(date); });

        out[row] = match != std::end(candidates) ? match->value : default_value;
    }

    query_count.fetch_add(ids.size(), std::memory_order_relaxed);
}
/// Adds (range, value) for `id`. If the id already has entries, the new one is inserted at the
/// lower_bound position by range, which keeps an ordered list ordered; otherwise a new single-element list is created.
template <typename T>
void RangeHashedDictionary::setAttributeValueImpl(attribute_t & attribute, const id_t id, const range_t & range, const T value)
{
    auto & collection = *std::get<ptr_t<T>>(attribute.maps);
    const auto found = collection.find(id);

    if (found == collection.end())
    {
        collection.insert({ id, values_t<T>{ value_t<T>{ range, value } } });
        return;
    }

    auto & entries = found->second;
    const auto position = std::lower_bound(std::begin(entries), std::end(entries), range,
        [] (const value_t<T> & existing, const range_t & inserted) {
            return existing.range < inserted;
        });

    entries.insert(position, value_t<T>{ range, value });
}
/// Stores `value` for (`id`, `range`) in the attribute, dispatching on the attribute's underlying type.
/// Numeric values are read from the Field as UInt64 / Int64 / Float64; strings are first copied into the
/// attribute's arena and stored as a StringRef to that copy. Insertion itself is done by setAttributeValueImpl<T>.
void RangeHashedDictionary::setAttributeValue(attribute_t & attribute, const id_t id, const range_t & range, const Field & value)
{
    switch (attribute.type)
    {
#define STORE_NUMERIC(TYPE, FIELD_TYPE) \
        case AttributeUnderlyingType::TYPE: \
            setAttributeValueImpl<TYPE>(attribute, id, range, value.get<FIELD_TYPE>()); \
            break;
        STORE_NUMERIC(UInt8, UInt64)
        STORE_NUMERIC(UInt16, UInt64)
        STORE_NUMERIC(UInt32, UInt64)
        STORE_NUMERIC(UInt64, UInt64)
        STORE_NUMERIC(Int8, Int64)
        STORE_NUMERIC(Int16, Int64)
        STORE_NUMERIC(Int32, Int64)
        STORE_NUMERIC(Int64, Int64)
        STORE_NUMERIC(Float32, Float64)
        STORE_NUMERIC(Float64, Float64)
#undef STORE_NUMERIC
        case AttributeUnderlyingType::String:
        {
            const auto & source = value.get<String>();
            const auto arena_copy = attribute.string_arena->insert(source.data(), source.size());
            const StringRef string_ref{arena_copy, source.size()};

            /// Same find / lower_bound / insert sequence as for numeric types.
            setAttributeValueImpl<StringRef>(attribute, id, range, string_ref);
            break;
        }
    }
}
/// Returns the attribute registered under attribute_name.
/// Throws BAD_ARGUMENTS if the name is not present in attribute_index_by_name.
const RangeHashedDictionary::attribute_t & RangeHashedDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);

    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{
        name + ": no such attribute '" + attribute_name + "'",
        ErrorCodes::BAD_ARGUMENTS};
}
/** Looks up an attribute by name and verifies that its underlying type equals `type`.
  * Throws BAD_ARGUMENTS (from getAttribute) if there is no such attribute,
  * and TYPE_MISMATCH if the attribute exists but has a different type.
  *
  * The parameter `name` hides the member `name` (the dictionary's name), so the member is accessed
  * through `this->` to prefix the message with the dictionary name, as the other exceptions of this class do.
  */
const RangeHashedDictionary::attribute_t & RangeHashedDictionary::getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const
{
    const auto & attribute = getAttribute(name);

    if (attribute.type != type)
        throw Exception{
            this->name + ": type mismatch: attribute " + name + " has type " + toString(attribute.type),
            ErrorCodes::TYPE_MISMATCH};

    return attribute;
}
}

View File

@ -247,11 +247,6 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F
if (table) if (table)
{ {
s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "FROM " << (s.hilite ? hilite_none : ""); s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "FROM " << (s.hilite ? hilite_none : "");
if (database)
{
database->formatImpl(s, state, frame);
s.ostr << ".";
}
if (typeid_cast<const ASTSelectQuery *>(&*table)) if (typeid_cast<const ASTSelectQuery *>(&*table))
{ {
@ -270,8 +265,16 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F
s.ostr << "\n" << indent_str << ")"; s.ostr << "\n" << indent_str << ")";
} }
else else
{
if (database)
{
database->formatImpl(s, state, frame);
s.ostr << ".";
}
table->formatImpl(s, state, frame); table->formatImpl(s, state, frame);
} }
}
if (final) if (final)
{ {

View File

@ -379,6 +379,26 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks)
} }
/** Whether a part directory is old.
  * It is old if its modification time, and also the modification time of every entry inside it
  * (only entries at one level of nesting are considered), is less than threshold.
  */
static bool isOldPartDirectory(Poco::File & directory, time_t threshold)
{
    const auto is_recent = [threshold] (const Poco::File & file)
    {
        return file.getLastModified().epochTime() >= threshold;
    };

    if (is_recent(directory))
        return false;

    Poco::DirectoryIterator end;
    for (Poco::DirectoryIterator it(directory); it != end; ++it)
    {
        if (is_recent(*it))
            return false;
    }

    return true;
}
void MergeTreeData::clearOldTemporaryDirectories() void MergeTreeData::clearOldTemporaryDirectories()
{ {
/// Если метод уже вызван из другого потока, то можно ничего не делать. /// Если метод уже вызван из другого потока, то можно ничего не делать.
@ -386,6 +406,8 @@ void MergeTreeData::clearOldTemporaryDirectories()
if (!lock.try_lock()) if (!lock.try_lock())
return; return;
time_t current_time = time(0);
/// Удаляем временные директории старше суток. /// Удаляем временные директории старше суток.
Poco::DirectoryIterator end; Poco::DirectoryIterator end;
for (Poco::DirectoryIterator it{full_path}; it != end; ++it) for (Poco::DirectoryIterator it{full_path}; it != end; ++it)
@ -396,7 +418,7 @@ void MergeTreeData::clearOldTemporaryDirectories()
try try
{ {
if (tmp_dir.isDirectory() && tmp_dir.getLastModified().epochTime() + 86400 < time(0)) if (tmp_dir.isDirectory() && isOldPartDirectory(tmp_dir, current_time - settings.temporary_directories_lifetime))
{ {
LOG_WARNING(log, "Removing temporary directory " << full_path << it.name()); LOG_WARNING(log, "Removing temporary directory " << full_path << it.name());
Poco::File(full_path + it.name()).remove(true); Poco::File(full_path + it.name()).remove(true);
@ -652,7 +674,16 @@ MergeTreeData::AlterDataPartTransactionPtr MergeTreeData::alterDataPart(
bool force_update_metadata; bool force_update_metadata;
createConvertExpression(part, part->columns, new_columns, expression, transaction->rename_map, force_update_metadata); createConvertExpression(part, part->columns, new_columns, expression, transaction->rename_map, force_update_metadata);
if (!skip_sanity_checks && transaction->rename_map.size() > settings.max_files_to_modify_in_alter_columns) size_t num_files_to_modify = transaction->rename_map.size();
size_t num_files_to_remove = 0;
for (const auto & from_to : transaction->rename_map)
if (from_to.second.empty())
++num_files_to_remove;
if (!skip_sanity_checks
&& (num_files_to_modify > settings.max_files_to_modify_in_alter_columns
|| num_files_to_remove > settings.max_files_to_remove_in_alter_columns))
{ {
transaction->clear(); transaction->clear();

View File

@ -130,8 +130,7 @@ BlockInputStreams StorageMergeTree::read(
auto & select = typeid_cast<const ASTSelectQuery &>(*query); auto & select = typeid_cast<const ASTSelectQuery &>(*query);
/// Try transferring some condition from WHERE to PREWHERE if enabled and viable /// Try transferring some condition from WHERE to PREWHERE if enabled and viable
if (settings.optimize_move_to_prewhere) if (settings.optimize_move_to_prewhere && select.where_expression && !select.prewhere_expression && !select.final)
if (select.where_expression && !select.prewhere_expression)
MergeTreeWhereOptimizer{query, context, data, column_names, log}; MergeTreeWhereOptimizer{query, context, data, column_names, log};
return reader.read(column_names, query, context, settings, processed_stage, max_block_size, threads, nullptr, 0); return reader.read(column_names, query, context, settings, processed_stage, max_block_size, threads, nullptr, 0);

View File

@ -2089,8 +2089,7 @@ BlockInputStreams StorageReplicatedMergeTree::read(
auto & select = typeid_cast<const ASTSelectQuery &>(*query); auto & select = typeid_cast<const ASTSelectQuery &>(*query);
/// Try transferring some condition from WHERE to PREWHERE if enabled and viable /// Try transferring some condition from WHERE to PREWHERE if enabled and viable
if (settings.optimize_move_to_prewhere) if (settings.optimize_move_to_prewhere && select.where_expression && !select.prewhere_expression && !select.final)
if (select.where_expression && !select.prewhere_expression)
MergeTreeWhereOptimizer{query, context, data, real_column_names, log}; MergeTreeWhereOptimizer{query, context, data, real_column_names, log};
Block virtual_columns_block; Block virtual_columns_block;

View File

@ -56,23 +56,38 @@ StorageView::StorageView(
inner_query = select; inner_query = select;
if (inner_query.database) extractDependentTable(inner_query);
select_database_name = typeid_cast<const ASTIdentifier &>(*inner_query.database).name;
else if (!select_table_name.empty())
context.getGlobalContext().addDependency(
DatabaseAndTableName(select_database_name, select_table_name),
DatabaseAndTableName(database_name, table_name));
}
void StorageView::extractDependentTable(const ASTSelectQuery & query)
{
if (!query.table)
return;
if (const ASTIdentifier * ast_id = typeid_cast<const ASTIdentifier *>(query.table.get()))
{
if (!query.database)
throw Exception("Logical error while creating StorageView." throw Exception("Logical error while creating StorageView."
" Could not retrieve database name from select query.", " Could not retrieve database name from select query.",
DB::ErrorCodes::LOGICAL_ERROR); DB::ErrorCodes::LOGICAL_ERROR);
if (inner_query.table) select_database_name = typeid_cast<const ASTIdentifier &>(*query.database).name;
select_table_name = typeid_cast<const ASTIdentifier &>(*inner_query.table).name; select_table_name = ast_id->name;
}
else if (const ASTSelectQuery * ast_select = typeid_cast<const ASTSelectQuery *>(query.table.get()))
{
extractDependentTable(*ast_select);
}
else else
throw Exception("Logical error while creating StorageView." throw Exception("Logical error while creating StorageView."
" Could not retrieve table name from select query.", " Could not retrieve table name from select query.",
DB::ErrorCodes::LOGICAL_ERROR); DB::ErrorCodes::LOGICAL_ERROR);
context.getGlobalContext().addDependency(
DatabaseAndTableName(select_database_name, select_table_name),
DatabaseAndTableName(database_name, table_name));
} }
@ -110,6 +125,7 @@ BlockInputStreams StorageView::read(
void StorageView::drop() void StorageView::drop()
{ {
if (!select_table_name.empty())
context.getGlobalContext().removeDependency( context.getGlobalContext().removeDependency(
DatabaseAndTableName(select_database_name, select_table_name), DatabaseAndTableName(select_database_name, select_table_name),
DatabaseAndTableName(database_name, table_name)); DatabaseAndTableName(database_name, table_name));

View File

@ -0,0 +1,11 @@
1
0
1
4
9
16
25
36
49
64
81

View File

@ -0,0 +1,11 @@
DROP TABLE IF EXISTS test.v1;
DROP TABLE IF EXISTS test.v2;
CREATE VIEW test.v1 AS SELECT 1 FROM (SELECT 1);
SELECT * FROM test.v1;
CREATE VIEW test.v2 AS SELECT number * number FROM (SELECT number FROM system.numbers LIMIT 10);
SELECT * FROM test.v2;
DROP TABLE test.v1;
DROP TABLE test.v2;

View File

@ -0,0 +1,5 @@
2016-06-02 1 version 0 0
2016-06-02 1 version 1 1
2016-06-02 2 version 1 1
2016-06-02 1 version 1 1
2016-06-02 2 version 1 1

View File

@ -0,0 +1,12 @@
DROP TABLE IF EXISTS test.replace;
CREATE TABLE test.replace ( EventDate Date, Id UInt64, Data String, Version UInt32) ENGINE = ReplacingMergeTree(EventDate, Id, 8192, Version);
INSERT INTO test.replace VALUES ('2016-06-02', 1, 'version 1', 1);
INSERT INTO test.replace VALUES ('2016-06-02', 2, 'version 1', 1);
INSERT INTO test.replace VALUES ('2016-06-02', 1, 'version 0', 0);
SELECT * FROM test.replace ORDER BY Id, Version;
SELECT * FROM test.replace FINAL ORDER BY Id, Version;
SELECT * FROM test.replace FINAL WHERE Version = 0 ORDER BY Id, Version;
DROP TABLE test.replace;

View File

@ -1,18 +1,12 @@
Сегодня внутренняя разработка компании Яндекс — <a href="https://clickhouse.yandex/">аналитическая СУБД ClickHouse</a>, стала доступна каждому. Исходники опубликованы на <a href="https://github.com/yandex/ClickHouse">GitHub</a> под лицензией Apache 2.0. Давайте я расскажу, зачем мы решили это сделать. Сегодня внутренняя разработка компании Яндекс — <a href="https://clickhouse.yandex/">аналитическая СУБД ClickHouse</a>, стала доступна каждому. Исходники опубликованы на <a href="https://github.com/yandex/ClickHouse">GitHub</a> под лицензией Apache 2.0.
<img src="https://habrastorage.org/files/d9b/066/e61/d9b066e61e1f480a977d889dc03ded99.png"/> <img src="https://habrastorage.org/files/d9b/066/e61/d9b066e61e1f480a977d889dc03ded99.png"/>
Изначально мы разрабатывали ClickHouse исключительно для задач <a href="https://metrika.yandex.ru/">Яндекс.Метрики</a> — для того, чтобы строить отчёты в интерактивном режиме по неагрегированным логам пользовательских действий. В связи с тем, что система является полноценной СУБД и обладает весьма широкой функциональностью, уже в начале использования в 2012 году, была написана <a href="https://clickhouse.yandex/reference_ru.html">подробная документация</a>. Это отличает ClickHouse от многих типичных внутренних разработок — специализированных и встраеваемых структур данных для решения конкретных задач, таких как, например, Metrage и OLAPServer, о которых я рассказывал в <a href="http://habrahabr.ru/company/yandex/blog/273305/">предыдущей статье</a>. ClickHouse позволяет выполнять аналитические запросы в интерактивном режиме по данным, обновляемым в реальном времени. Система способна масштабироваться до десятков триллионов записей и петабайт хранимых данных. Использование ClickHouse открывает возможности, которые было трудно представить раньше: вы можете сохранять весь поток данных без предварительной агрегации и быстро получать отчёты в любых разрезах. ClickHouse разработан в Яндексе для задач <a href="https://metrika.yandex.ru/">Яндекс.Метрики</a> — второй по величине системе веб-аналитики в мире.
<cut text="Почему мы выкладываем ClickHouse?" /> В этой статье мы расскажем, как и для чего ClickHouse появился в Яндексе, что он умеет, сравним его с другими системами и покажем, как его поднять у себя с минимальными усилиями.
Это привело к тому, что ClickHouse постепенно распространился по многим отделам Яндекса. Неожиданно оказалось, что система может быть установлена по инструкции и работает "из коробки", без необходимости привлечения разработчиков. ClickHouse стал использоваться в Директе, Маркете, Почте, AdFox, Вебмастере, в мониторингах и в бизнес аналитике. Каждый раз ClickHouse позволял решить некоторую задачу, для которой раньше не было подходящих инструментов, либо решить задачу на порядки более эффективно. <cut />
Постепенно возник спрос на использование ClickHouse не только во внутренних продуктах Яндекса. Например, в 2013 году, ClickHouse применялся для анализа метаданных о событиях <a href="https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/">эксперимента LHCb в CERN</a>. Система могла бы использоваться более широко, но в то время этому мешал закрытый статус. Другой пример: open-source технология <a href="https://tech.yandex.ru/tank/">Яндекс.Танк</a> внутри Яндекса использует ClickHouse для хранения данных телеметрии, тогда как для внешних пользователей, в качестве базы данных был доступен только MySQL, который плохо подходит для данной задачи.
По мере расширения пользовательской базы, возникла необходимость тратить на разработку чуть больше усилий, хоть и незначительно по сравнению с задачами Метрики. Зато в качестве отдачи, мы получаем повышение качества продукта, особенно в плане юзабилити.
Расширение пользовательской базы позволяет рассматривать примеры использования, о которых другим способом было бы трудно догадаться. Также это позволяет раньше находить баги и неудобства, которые имеют значение в том числе и для основного применения ClickHouse в Метрике. Без сомнения, это делает продукт качественнее.
<b>Давайте рассмотрим, где находится ниша ClickHouse.</b> Зачем кому-то может понадобиться использовать ClickHouse, когда есть много других технологий для работы с большими данными? <b>Давайте рассмотрим, где находится ниша ClickHouse.</b> Зачем кому-то может понадобиться использовать ClickHouse, когда есть много других технологий для работы с большими данными?
@ -43,19 +37,30 @@
</li> </li>
<li>Open-source OLAP СУБД. Пример: <a href="https://github.com/infinidb/infinidb">InfiniDB</a>, <a href="https://www.monetdb.org/">MonetDB</a>, <a href="https://github.com/LucidDB/luciddb">LucidDB</a>. <li>Open-source OLAP СУБД. Пример: <a href="https://github.com/infinidb/infinidb">InfiniDB</a>, <a href="https://www.monetdb.org/">MonetDB</a>, <a href="https://github.com/LucidDB/luciddb">LucidDB</a>.
Разработка всех этих проектов заброшена, они никогда не были достаточно зрелыми и, по сути, так и не вышли из альфа-версии. Эти системы не были распределёнными, что является критически необходимым для обработки больших данных. Активная разработка ClickHouse, зрелость технологии и ориентация на практические потребности, возникающие при обработке больших данных, обеспечечивается задачами Яндекса. Без использования «в бою» на реальных задачах, выходящих за рамки возможностей существующих систем, создать качественный продукт было бы невозможно. Разработка всех этих проектов заброшена, они никогда не были достаточно зрелыми и, по сути, так и не вышли из альфа-версии. Эти системы не были распределёнными, что является критически необходимым для обработки больших данных. Активная разработка ClickHouse, зрелость технологии и ориентация на практические потребности, возникающие при обработке больших данных, обеспечивается задачами Яндекса. Без использования «в бою» на реальных задачах, выходящих за рамки возможностей существующих систем, создать качественный продукт было бы невозможно.
</li> </li>
<li>Open-source системы для аналитики, не являющиеся Relational OLAP СУБД. <li>Open-source системы для аналитики, не являющиеся Relational OLAP СУБД.
Пример: <a href="http://druid.io/">Metamarkets Druid</a>, <a href="http://kylin.apache.org/">Apache Kylin</a>. Пример: <a href="http://druid.io/">Metamarkets Druid</a>, <a href="http://kylin.apache.org/">Apache Kylin</a>.
Нашё отличия: ClickHouse не требует предагрегации данных. ClickHouse поддерживает диалект языка SQL и предоставляет удобство реляционных СУБД. Наши отличия: ClickHouse не требует предагрегации данных. ClickHouse поддерживает диалект языка SQL и предоставляет удобство реляционных СУБД.
</li> </li>
</ol> </ol>
В рамках своей достаточно узкой ниши, у ClickHouse до сих пор нет альтернатив. В рамках более широкой области применения, ClickHouse может оказаться выгоднее других систем с точки зрения <a href="https://clickhouse.yandex/benchmark.html">скорости обработки запросов</a>, эффективности использования ресурсов и простоты эксплуатации. В рамках своей достаточно узкой ниши, у ClickHouse до сих пор нет альтернатив. В рамках более широкой области применения, ClickHouse может оказаться выгоднее других систем с точки зрения <a href="https://clickhouse.yandex/benchmark.html">скорости обработки запросов</a>, эффективности использования ресурсов и простоты эксплуатации.
Поэтому нам выгодно сделать ClickHouse открытым сегодня.
<img src="https://habrastorage.org/files/37e/1b6/556/37e1b65562844a7a9c2477f9b5f7cda1.png"/>
<i>На картинке: карта кликов в Яндекс.Метрике и соответствующий запрос в ClickHouse</i>
Изначально мы разрабатывали ClickHouse исключительно для задач <a href="https://metrika.yandex.ru/">Яндекс.Метрики</a> — для того, чтобы строить отчёты в интерактивном режиме по неагрегированным логам пользовательских действий. В связи с тем, что система является полноценной СУБД и обладает весьма широкой функциональностью, уже в начале использования в 2012 году, была написана <a href="https://clickhouse.yandex/reference_ru.html">подробная документация</a>. Это отличает ClickHouse от многих типичных внутренних разработок — специализированных и встраиваемых структур данных для решения конкретных задач, таких как, например, Metrage и OLAPServer, о которых я рассказывал в <a href="http://habrahabr.ru/company/yandex/blog/273305/">предыдущей статье</a>.
Это привело к тому, что ClickHouse постепенно распространился по многим отделам Яндекса. Неожиданно оказалось, что система может быть установлена по инструкции и работает "из коробки", без необходимости привлечения разработчиков. ClickHouse стал использоваться в Директе, Маркете, Почте, AdFox, Вебмастере, в мониторингах и в бизнес-аналитике. Каждый раз ClickHouse позволял решить некоторую задачу, для которой раньше не было подходящих инструментов, либо решить задачу на порядки более эффективно.
Постепенно возник спрос на использование ClickHouse не только во внутренних продуктах Яндекса. Например, в 2013 году, ClickHouse применялся для анализа метаданных о событиях <a href="https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/">эксперимента LHCb в CERN</a>. Система могла бы использоваться более широко, но в то время этому мешал закрытый статус. Другой пример: open-source технология <a href="https://tech.yandex.ru/tank/">Яндекс.Танк</a> внутри Яндекса использует ClickHouse для хранения данных телеметрии, тогда как для внешних пользователей, в качестве базы данных был доступен только MySQL, который плохо подходит для данной задачи.
По мере расширения пользовательской базы, возникла необходимость тратить на разработку чуть больше усилий, хоть и незначительно по сравнению с задачами Метрики. Зато в качестве отдачи, мы получаем повышение качества продукта, особенно в плане юзабилити.
Расширение пользовательской базы позволяет рассматривать примеры использования, о которых другим способом было бы трудно догадаться. Также это позволяет раньше находить баги и неудобства, которые имеют значение в том числе и для основного применения ClickHouse в Метрике. Без сомнения, это делает продукт качественнее. Поэтому нам выгодно сделать ClickHouse открытым сегодня.
<h1>Как перестать бояться и начать использовать ClickHouse</h1> <h1>Как перестать бояться и начать использовать ClickHouse</h1>
@ -70,14 +75,14 @@
Пакет clickhouse-client содержит программу <a href="https://clickhouse.yandex/reference_ru.html#%D0%9A%D0%BB%D0%B8%D0%B5%D0%BD%D1%82%20%D0%BA%D0%BE%D0%BC%D0%B0%D0%BD%D0%B4%D0%BD%D0%BE%D0%B9%20%D1%81%D1%82%D1%80%D0%BE%D0%BA%D0%B8">clickhouse-client</a> — клиент ClickHouse для работы в интерактивном режиме. Пакет clickhouse-server-base содержит бинарник clickhouse-server, а clickhouse-server-common — конфигурационные файлы к серверу. Пакет clickhouse-client содержит программу <a href="https://clickhouse.yandex/reference_ru.html#%D0%9A%D0%BB%D0%B8%D0%B5%D0%BD%D1%82%20%D0%BA%D0%BE%D0%BC%D0%B0%D0%BD%D0%B4%D0%BD%D0%BE%D0%B9%20%D1%81%D1%82%D1%80%D0%BE%D0%BA%D0%B8">clickhouse-client</a> — клиент ClickHouse для работы в интерактивном режиме. Пакет clickhouse-server-base содержит бинарник clickhouse-server, а clickhouse-server-common — конфигурационные файлы к серверу.
Конфигурационные файлы сервера находятся в /etc/clickhouse-server/. Главное, на что следует обратить внимание перед началом работы — элемент path — место хранения данных. Не обязательно модифицировать непосредственно файл config.xml — это не очень удобно при обновлении пакетов. Вместо этого, можно переопределить нужные элементы <a href="https://clickhouse.yandex/reference_ru.html#%D0%9A%D0%BE%D0%BD%D1%84%D0%B8%D0%B3%D1%83%D1%80%D0%B0%D1%86%D0%B8%D0%BE%D0%BD%D0%BD%D1%8B%D0%B5%20%D1%84%D0%B0%D0%B9%D0%BB%D1%8B">в файлах в config.d директории</a>. Конфигурационные файлы сервера находятся в /etc/clickhouse-server/. Главное, на что следует обратить внимание перед началом работы — элемент path — место хранения данных. Необязательно модифицировать непосредственно файл config.xml — это не очень удобно при обновлении пакетов. Вместо этого можно переопределить нужные элементы <a href="https://clickhouse.yandex/reference_ru.html#%D0%9A%D0%BE%D0%BD%D1%84%D0%B8%D0%B3%D1%83%D1%80%D0%B0%D1%86%D0%B8%D0%BE%D0%BD%D0%BD%D1%8B%D0%B5%20%D1%84%D0%B0%D0%B9%D0%BB%D1%8B">в файлах в config.d директории</a>.
Также имеет смысл обратить внимание на <a href="https://clickhouse.yandex/reference_ru.html#%D0%9F%D1%80%D0%B0%D0%B2%D0%B0%20%D0%B4%D0%BE%D1%81%D1%82%D1%83%D0%BF%D0%B0">настройки прав доступа</a>. Также имеет смысл обратить внимание на <a href="https://clickhouse.yandex/reference_ru.html#%D0%9F%D1%80%D0%B0%D0%B2%D0%B0%20%D0%B4%D0%BE%D1%81%D1%82%D1%83%D0%BF%D0%B0">настройки прав доступа</a>.
Сервер не запускается самостоятельно при установке пакета и не перезапускается сам при обновлении. Сервер не запускается самостоятельно при установке пакета и не перезапускается сам при обновлении.
Для запуска сервера, выполните: Для запуска сервера, выполните:
<source lang="Bash">sudo service clickhouse-server start</source> <source lang="Bash">sudo service clickhouse-server start</source>
Логи сервера расположены по умолчанию в директории /var/log/clickhouse-server/ Логи сервера расположены по умолчанию в директории /var/log/clickhouse-server/ .
После появления сообщения Ready for connections в логе, сервер будет принимать соединения. После появления сообщения Ready for connections в логе, сервер будет принимать соединения.
Для подключения к серверу, используйте программу clickhouse-client. Для подключения к серверу, используйте программу clickhouse-client.
@ -230,17 +235,17 @@ ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);
</spoiler> </spoiler>
Мы создали таблицу типа <a href="https://clickhouse.yandex/reference_ru.html#MergeTree">MergeTree</a>. Таблицы семейства MergeTree рекомендуется использовать для любых серьёзных применений. Такие таблицы содержат первичный ключ, по которому данные инкрементально сортируются, что позволяет быстро выполнять запросы по диапазону первичного ключа. Мы создали таблицу типа <a href="https://clickhouse.yandex/reference_ru.html#MergeTree">MergeTree</a>. Таблицы семейства MergeTree рекомендуется использовать для любых серьёзных применений. Такие таблицы содержат первичный ключ, по которому данные инкрементально сортируются, что позволяет быстро выполнять запросы по диапазону первичного ключа.
Например, если у нас есть логи рекламной сети, и нам нужно показывать отчёты для конкретных клиентов — рекламодателей, то первичный ключ в таблице должен начинаться на идентификатор клиента ClientId, чтобы для получения данных для одного клиента, достаточно было только прочитать небольшой диапазон данных. Например, если у нас есть логи рекламной сети, и нам нужно показывать отчёты для конкретных клиентов — рекламодателей, то первичный ключ в таблице должен начинаться на идентификатор клиента, чтобы для получения данных для одного клиента, достаточно было только прочитать небольшой диапазон данных.
<h3>Загружаем данные в таблицу</h3> <h3>Загружаем данные в таблицу</h3>
<source lang="Bash">xz -v -c -d &lt; ontime.csv.xz | clickhouse-client --query="INSERT INTO ontime FORMAT CSV"</source> <source lang="Bash">xz -v -c -d &lt; ontime.csv.xz | clickhouse-client --query="INSERT INTO ontime FORMAT CSV"</source>
Запрос INSERT в ClickHouse позволяет загружать данные в любом <a href="https://clickhouse.yandex/reference_ru.html#%D0%A4%D0%BE%D1%80%D0%BC%D0%B0%D1%82%D1%8B">поддерживаемом формате</a>. При этом, на загрузку данных расходуется O(1) памяти. На вход запроса INSERT можно передать любой объём данных. Вставлять данные всегда следует <a href="https://clickhouse.yandex/reference_ru.html#%D0%9F%D1%80%D0%BE%D0%B8%D0%B7%D0%B2%D0%BE%D0%B4%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D0%BE%D1%81%D1%82%D1%8C%20%D0%BF%D1%80%D0%B8%20%D0%B2%D1%81%D1%82%D0%B0%D0%B2%D0%BA%D0%B5%20%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85.">пачками не слишком маленького размера</a>. При этом, вставка блоков данных размера до max_insert_block_size (= 1&nbsp;048&nbsp;576 строк по-умолчанию), является атомарной: блок данных либо целиком вставится, либо целиком не вставится. В случае разрыва соединения в процессе вставки, вы можете не знать, вставился ли блок данных. Для достижения exactly once семантики, для <a href="https://clickhouse.yandex/reference_ru.html#%D0%A0%D0%B5%D0%BF%D0%BB%D0%B8%D0%BA%D0%B0%D1%86%D0%B8%D1%8F%20%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85">реплицированных таблиц</a>, поддерживается идемпотентность: вы можете вставить один и тот же блок данных повторно, возможно, на другую реплику, и он будет вставлен только один раз. В данном примере, мы вставляем данные из localhost, поэтому мы не беспокоимся о формировании пачек и exactly-once семантике. Запрос INSERT в ClickHouse позволяет загружать данные в любом <a href="https://clickhouse.yandex/reference_ru.html#%D0%A4%D0%BE%D1%80%D0%BC%D0%B0%D1%82%D1%8B">поддерживаемом формате</a>. При этом на загрузку данных расходуется O(1) памяти. На вход запроса INSERT можно передать любой объём данных. 
Вставлять данные всегда следует <a href="https://clickhouse.yandex/reference_ru.html#%D0%9F%D1%80%D0%BE%D0%B8%D0%B7%D0%B2%D0%BE%D0%B4%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D0%BE%D1%81%D1%82%D1%8C%20%D0%BF%D1%80%D0%B8%20%D0%B2%D1%81%D1%82%D0%B0%D0%B2%D0%BA%D0%B5%20%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85.">пачками не слишком маленького размера</a>. При этом вставка блоков данных размера до max_insert_block_size (= 1&nbsp;048&nbsp;576 строк по-умолчанию), является атомарной: блок данных либо целиком вставится, либо целиком не вставится. В случае разрыва соединения в процессе вставки, вы можете не знать, вставился ли блок данных. Для достижения exactly-once семантики, для <a href="https://clickhouse.yandex/reference_ru.html#%D0%A0%D0%B5%D0%BF%D0%BB%D0%B8%D0%BA%D0%B0%D1%86%D0%B8%D1%8F%20%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85">реплицированных таблиц</a>, поддерживается идемпотентность: вы можете вставить один и тот же блок данных повторно, возможно, на другую реплику, и он будет вставлен только один раз. В данном примере, мы вставляем данные из localhost, поэтому мы не беспокоимся о формировании пачек и exactly-once семантике.
Запрос INSERT в таблицы типа MergeTree является неблокирующим, равно как и SELECT. После загрузки данных, или даже во время процесса загрузки, мы уже можем выполнять SELECT-ы. Запрос INSERT в таблицы типа MergeTree является неблокирующим, равно как и SELECT. После загрузки данных или даже во время процесса загрузки мы уже можем выполнять SELECT-ы.
<small>В данном примере, некоторая неоптимальность состоит в том, что в таблице используется тип данных String тогда, когда подошёл бы <a href="https://clickhouse.yandex/reference_ru.html#Enum">Enum</a> или числовой тип. Если множество разных значений строк заведомо небольшое (пример: название операционной системы, производитель мобильного телефона), то для максимальной производительности, мы рекомендуем использовать Enum-ы или числа. Если множество строк потенциально неограничено (пример: поисковый запрос, URL), то используйте тип данных String. В данном примере, некоторая неоптимальность состоит в том, что в таблице используется тип данных String тогда, когда подошёл бы <a href="https://clickhouse.yandex/reference_ru.html#Enum">Enum</a> или числовой тип. Если множество разных значений строк заведомо небольшое (пример: название операционной системы, производитель мобильного телефона), то для максимальной производительности, мы рекомендуем использовать Enum-ы или числа. Если множество строк потенциально неограничено (пример: поисковый запрос, URL), то используйте тип данных String.
Во вторых отметим, что в данном примере, структура таблицы содержит избыточные столбцы Year, Quarter, Month, DayofMonth, DayOfWeek, тогда как достаточно одного FlightDate. Скорее всего, это сделано для эффективной работы других СУБД, в которых функции для работы с датой и временем, могут быть неэффективными. В случае ClickHouse, в этом нет необходимости, так как <a href="https://clickhouse.yandex/reference_ru.html#%D0%A4%D1%83%D0%BD%D0%BA%D1%86%D0%B8%D0%B8%20%D0%B4%D0%BB%D1%8F%20%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D1%8B%20%D1%81%20%D0%B4%D0%B0%D1%82%D0%B0%D0%BC%D0%B8%20%D0%B8%20%D0%B2%D1%80%D0%B5%D0%BC%D0%B5%D0%BD%D0%B5%D0%BC">соответствующие функции</a> хорошо оптимизированы. Впрочем, в наличии лишних столбцов нет проблемы: так как ClickHouse — это <a href="https://en.wikipedia.org/wiki/Column-oriented_DBMS">столбцовая СУБД</a>, вы можете позволить себе иметь в таблице достаточно много столбцов. Сотни столбцов — это нормально для ClickHouse.</small> Во вторых, отметим, что в данном примере структура таблицы содержит избыточные столбцы Year, Quarter, Month, DayofMonth, DayOfWeek, тогда как достаточно одного FlightDate. Скорее всего, это сделано для эффективной работы других СУБД, в которых функции для работы с датой и временем, могут работать недостаточно быстро. В случае ClickHouse, в этом нет необходимости, так как <a href="https://clickhouse.yandex/reference_ru.html#%D0%A4%D1%83%D0%BD%D0%BA%D1%86%D0%B8%D0%B8%20%D0%B4%D0%BB%D1%8F%20%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D1%8B%20%D1%81%20%D0%B4%D0%B0%D1%82%D0%B0%D0%BC%D0%B8%20%D0%B8%20%D0%B2%D1%80%D0%B5%D0%BC%D0%B5%D0%BD%D0%B5%D0%BC">соответствующие функции</a> хорошо оптимизированы. Впрочем, в наличии лишних столбцов нет проблемы: так как ClickHouse — это <a href="https://en.wikipedia.org/wiki/Column-oriented_DBMS">столбцовая СУБД</a>, вы можете позволить себе иметь в таблице достаточно много столбцов. Сотни столбцов — это нормально для ClickHouse.
<h3>Примеры работы с загруженными данными</h3> <h3>Примеры работы с загруженными данными</h3>
@ -393,7 +398,7 @@ LIMIT 20
Как и ожидается, более-менее долгие запросы работают в несколько раз быстрее, если их выполнять на трёх серверах, а не на одном. <spoiler title="Пример"> Как и ожидается, более-менее долгие запросы работают в несколько раз быстрее, если их выполнять на трёх серверах, а не на одном. <spoiler title="Пример">
<img src="https://habrastorage.org/files/ece/020/129/ece020129fdf4a18a6e75daf2e699cb9.png"/> <img src="https://habrastorage.org/files/ece/020/129/ece020129fdf4a18a6e75daf2e699cb9.png"/>
Можно заметить, что результат рассчёта квантилей слегка отличается. Это происходит, потому что реализация алгоритма <a href="https://github.com/tdunning/t-digest/raw/master/docs/t-digest-paper/histo.pdf">t-digest</a> является недетерминированной — зависит от порядка обработки данных.</spoiler> Можно заметить, что результат расчёта квантилей слегка отличается. Это происходит потому, что реализация алгоритма <a href="https://github.com/tdunning/t-digest/raw/master/docs/t-digest-paper/histo.pdf">t-digest</a> является недетерминированной — зависит от порядка обработки данных.</spoiler>
В данном примере, мы использовали кластер из трёх шардов, каждый шард которого состоит из одной реплики. Для реальных задач, в целях отказоустойчивости, каждый шард должен состоять из двух или трёх реплик, расположенных в разных датацентрах. (Поддерживается произвольное количество реплик). В данном примере, мы использовали кластер из трёх шардов, каждый шард которого состоит из одной реплики. Для реальных задач, в целях отказоустойчивости, каждый шард должен состоять из двух или трёх реплик, расположенных в разных датацентрах. (Поддерживается произвольное количество реплик).
<spoiler title="Конфигурация кластера из одного шарда, на котором данные расположены в трёх репликах"> <spoiler title="Конфигурация кластера из одного шарда, на котором данные расположены в трёх репликах">
@ -419,9 +424,9 @@ LIMIT 20
&lt;/remote_servers&gt; &lt;/remote_servers&gt;
</source> </source>
</spoiler> </spoiler>
Для работы репликации (хранение метаданных и координация действий), требуется <a href="http://zookeeper.apache.org/">ZooKeeper</a>. ClickHouse самостоятельно будет обеспечивать консистентность данных на репликах и производит восстановление после сбоя. Рекомендуется расположить кластер ZooKeeper на отдельных серверах. Для работы репликации (хранение метаданных и координация действий), требуется <a href="http://zookeeper.apache.org/">ZooKeeper</a>. ClickHouse самостоятельно будет обеспечивать консистентность данных на репликах и производить восстановление после сбоев. Рекомендуется расположить кластер ZooKeeper на отдельных серверах.
<small>На самом деле, использование ZooKeeper не обязательно: в самых простых случаях, вы можете дублировать данные, записывая их на все реплики вручную, и не использовать встроенный механизм репликации. Но такой способ не рекомендуется — ведь в таком случае, ClickHouse не сможет обеспечивать консистентность данных на репликах.</small> На самом деле использование ZooKeeper не обязательно: в самых простых случаях, вы можете дублировать данные, записывая их на все реплики вручную, и не использовать встроенный механизм репликации. Но такой способ не рекомендуется — ведь в таком случае ClickHouse не сможет обеспечивать консистентность данных на репликах.
<spoiler title="Пропишите адреса ZooKeeper в конфигурационном файле"> <spoiler title="Пропишите адреса ZooKeeper в конфигурационном файле">
<source lang="XML"> <source lang="XML">
@ -449,7 +454,7 @@ LIMIT 20
&lt;replica&gt;01&lt;/replica&gt; &lt;replica&gt;01&lt;/replica&gt;
&lt;/macros&gt; &lt;/macros&gt;
</source> </source>
При создании реплицированной таблицы, если других реплик ещё нет, то создаётся первая реплика, а если есть — создаётся новая реплика и клонирует данные существующих реплик. Вы можете сразу создать все таблицы-реплики и затем загрузить в них данные, либо сначала создать часть реплик, а затем добавить другие, уже после загрузки или во время загрузки данных. Если при создании реплицированной таблицы других реплик ещё нет, то создаётся первая реплика, а если есть — создаётся новая реплика и клонирует данные существующих реплик. Вы можете сразу создать все таблицы-реплики и затем загрузить в них данные, либо сначала создать часть реплик, а затем добавить другие, уже после загрузки или во время загрузки данных.
<source lang="SQL"> <source lang="SQL">
CREATE TABLE ontime_replica (...) CREATE TABLE ontime_replica (...)
@ -463,10 +468,8 @@ ENGINE = ReplicatedMergeTree(
Здесь видно, что мы используем тип таблицы <a href="https://clickhouse.yandex/reference_ru.html#ReplicatedMergeTree">ReplicatedMergeTree</a>, указывая в качестве параметров путь в ZooKeeper, содержащий идентификатор шарда, а также идентификатор реплики. Здесь видно, что мы используем тип таблицы <a href="https://clickhouse.yandex/reference_ru.html#ReplicatedMergeTree">ReplicatedMergeTree</a>, указывая в качестве параметров путь в ZooKeeper, содержащий идентификатор шарда, а также идентификатор реплики.
<source lang="SQL">INSERT INTO ontime_replica SELECT * FROM ontime;</source> <source lang="SQL">INSERT INTO ontime_replica SELECT * FROM ontime;</source>
Репликация работает в режиме multi-master. Вы можете вставлять данные на любую реплику, и данные автоматически разъезжаются по всем репликам. При этом, репликация асинхронная, и в заданный момент времени, реплики могут содержать не все недавно записанные данные. Для записи данных, достаточно доступности хотя бы одной реплики. Остальные реплики будут скачивать новые данные и восстанавливать консистентность как только станут активными. Такая схема допускает возможность потери только что вставленных данных. Репликация работает в режиме multi-master. Вы можете вставлять данные на любую реплику, и данные автоматически разъезжаются по всем репликам. При этом репликация асинхронная, и в заданный момент времени, реплики могут содержать не все недавно записанные данные. Для записи данных, достаточно доступности хотя бы одной реплики. Остальные реплики будут скачивать новые данные и восстанавливать консистентность как только станут активными. Такая схема допускает возможность потери только что вставленных данных.
<h1>Как вы можете повлиять на развитие ClickHouse?</h1> <h1>Как вы можете повлиять на развитие ClickHouse?</h1>
Если у вас возникли вопросы, вы можете задать их в комментариях к этой статье, либо на <a href="http://stackoverflow.com/">Stackoverflow</a> с тегом «clickhouse». Также вы можете создать тему для обсуждения в <a href="https://groups.google.com/group/clickhouse">группе</a> или написать своё предложение на рассылку clickhouse-feedback@yandex-team.ru. Если у вас возникли вопросы, вы можете задать их в комментариях к этой статье, либо на <a href="http://stackoverflow.com/">StackOverflow</a> с тегом «clickhouse». Также вы можете создать тему для обсуждения в <a href="https://groups.google.com/group/clickhouse">группе</a> или написать своё предложение на рассылку clickhouse-feedback@yandex-team.ru. А если вам хочется попробовать поработать над ClickHouse изнутри, вы можете присоединиться к нашей команде в Яндексе. У нас открыты <a href="https://yandex.ru/jobs/vacancies/dev/?tags=c%2B%2B">вакансии</a> и <a href="https://yandex.ru/jobs/vacancies/interns/summer">стажировки</a>.
У вас есть возможность разрабатывать ClickHouse, присоединившись к нашей команде в Яндексе. Для этого достаточно знать C++. Выберите <a href="https://yandex.ru/jobs/vacancies/dev/?tags=c%2B%2B">любую вакансию из списка</a> и укажите, что хотите попасть в группу разработки ClickHouse. Также у нас открыт приём на <a href="https://yandex.ru/jobs/vacancies/interns/summer">стажировку</a>.

View File

@ -52,6 +52,8 @@ namespace Poco { class TaskManager; }
class BaseDaemon : public Poco::Util::ServerApplication class BaseDaemon : public Poco::Util::ServerApplication
{ {
friend class SignalListener;
public: public:
BaseDaemon(); BaseDaemon();
~BaseDaemon(); ~BaseDaemon();
@ -132,6 +134,14 @@ protected:
/// Используется при exitOnTaskError() /// Используется при exitOnTaskError()
void handleNotification(Poco::TaskFailedNotification *); void handleNotification(Poco::TaskFailedNotification *);
/// thread safe
virtual void handleSignal(int signal_id);
/// реализация обработки сигналов завершения через pipe не требует блокировки сигнала с помощью sigprocmask во всех потоках
void waitForTerminationRequest() override;
/// thread safe
virtual void onInterruptSignals(int signal_id);
std::unique_ptr<Poco::TaskManager> task_manager; std::unique_ptr<Poco::TaskManager> task_manager;
/// Создание и автоматическое удаление pid файла. /// Создание и автоматическое удаление pid файла.
@ -156,8 +166,7 @@ protected:
PID pid; PID pid;
/// Получен ли сигнал на завершение? Этот флаг устанавливается в BaseDaemonApplication. std::atomic_bool is_cancelled{false};
bool is_cancelled = false;
/// Флаг устанавливается по сообщению из Task (при аварийном завершении). /// Флаг устанавливается по сообщению из Task (при аварийном завершении).
bool task_failed = false; bool task_failed = false;
@ -179,4 +188,8 @@ protected:
std::unique_ptr<GraphiteWriter> graphite_writer; std::unique_ptr<GraphiteWriter> graphite_writer;
boost::optional<size_t> layer; boost::optional<size_t> layer;
std::mutex signal_handler_mutex;
std::condition_variable signal_event;
size_t terminate_signals_counter = 0;
}; };

View File

@ -131,9 +131,9 @@ static void call_default_signal_handler(int sig)
using ThreadNumber = decltype(Poco::ThreadNumber::get()); using ThreadNumber = decltype(Poco::ThreadNumber::get());
static const size_t buf_size = sizeof(int) + sizeof(siginfo_t) + sizeof(ucontext_t) + sizeof(ThreadNumber); static const size_t buf_size = sizeof(int) + sizeof(siginfo_t) + sizeof(ucontext_t) + sizeof(ThreadNumber);
using signal_function = void(int, siginfo_t*, void*);
/** Обработчик сигналов HUP / USR1 */ static void writeSignalIDtoSignalPipe(int sig)
static void close_logs_signal_handler(int sig, siginfo_t * info, void * context)
{ {
char buf[buf_size]; char buf[buf_size];
DB::WriteBufferFromFileDescriptor out(signal_pipe.write_fd, buf_size, buf); DB::WriteBufferFromFileDescriptor out(signal_pipe.write_fd, buf_size, buf);
@ -141,10 +141,21 @@ static void close_logs_signal_handler(int sig, siginfo_t * info, void * context)
out.next(); out.next();
} }
/** Handler for the HUP / USR1 signals.
  * Does no work itself: it only forwards the signal number to writeSignalIDtoSignalPipe.
  * The `info` and `context` arguments are not used.
  */
static void closeLogsSignalHandler(int sig, siginfo_t * info, void * context)
{
writeSignalIDtoSignalPipe(sig);
}
/** Handler for termination requests (installed for SIGINT, SIGQUIT and SIGTERM in BaseDaemon::initialize).
  * Passes the signal number to writeSignalIDtoSignalPipe(); SignalListener later
  * hands these signals to BaseDaemon::handleSignal(). The siginfo and context arguments are not used.
  */
static void terminateRequestedSignalHandler(int sig, siginfo_t * info, void * context)
{
    /// Only the signal number is forwarded.
    (void)info;
    (void)context;

    writeSignalIDtoSignalPipe(sig);
}
/** Обработчик некоторых сигналов. Выводит информацию в лог (если получится). /** Обработчик некоторых сигналов. Выводит информацию в лог (если получится).
*/ */
static void fault_signal_handler(int sig, siginfo_t * info, void * context) static void faultSignalHandler(int sig, siginfo_t * info, void * context)
{ {
char buf[buf_size]; char buf[buf_size];
DB::WriteBufferFromFileDescriptor out(signal_pipe.write_fd, buf_size, buf); DB::WriteBufferFromFileDescriptor out(signal_pipe.write_fd, buf_size, buf);
@ -174,7 +185,9 @@ static bool already_printed_stack_trace = false;
class SignalListener : public Poco::Runnable class SignalListener : public Poco::Runnable
{ {
public: public:
SignalListener() : log(&Logger::get("BaseDaemon")) SignalListener(BaseDaemon & daemon_)
: log(&Logger::get("BaseDaemon"))
, daemon(daemon_)
{ {
} }
@ -204,6 +217,12 @@ public:
onTerminate(message, thread_num); onTerminate(message, thread_num);
} }
else if (sig == SIGINT ||
sig == SIGQUIT ||
sig == SIGTERM)
{
daemon.handleSignal(sig);
}
else else
{ {
siginfo_t info; siginfo_t info;
@ -221,8 +240,9 @@ public:
private: private:
Logger * log; Logger * log;
BaseDaemon & daemon;
private:
void onTerminate(const std::string & message, ThreadNumber thread_num) const void onTerminate(const std::string & message, ThreadNumber thread_num) const
{ {
LOG_ERROR(log, "(from thread " << thread_num << ") " << message); LOG_ERROR(log, "(from thread " << thread_num << ") " << message);
@ -739,42 +759,31 @@ void BaseDaemon::initialize(Application& self)
std::set_terminate(terminate_handler); std::set_terminate(terminate_handler);
/// Ставим обработчики сигналов /// Ставим обработчики сигналов
auto add_signal_handler =
[](const std::vector<int> & signals, signal_function handler)
{
struct sigaction sa; struct sigaction sa;
memset(&sa, 0, sizeof(sa)); memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = fault_signal_handler; sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO; sa.sa_flags = SA_SIGINFO;
{ {
int signals[] = {SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, 0};
if (sigemptyset(&sa.sa_mask)) if (sigemptyset(&sa.sa_mask))
throw Poco::Exception("Cannot set signal handler."); throw Poco::Exception("Cannot set signal handler.");
for (size_t i = 0; signals[i]; ++i) for (auto signal : signals)
if (sigaddset(&sa.sa_mask, signals[i])) if (sigaddset(&sa.sa_mask, signal))
throw Poco::Exception("Cannot set signal handler."); throw Poco::Exception("Cannot set signal handler.");
for (size_t i = 0; signals[i]; ++i) for (auto signal : signals)
if (sigaction(signals[i], &sa, 0)) if (sigaction(signal, &sa, 0))
throw Poco::Exception("Cannot set signal handler."); throw Poco::Exception("Cannot set signal handler.");
} }
};
sa.sa_sigaction = close_logs_signal_handler; add_signal_handler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE}, faultSignalHandler);
add_signal_handler({SIGHUP, SIGUSR1}, closeLogsSignalHandler);
{ add_signal_handler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler);
int signals[] = {SIGHUP, SIGUSR1, 0};
if (sigemptyset(&sa.sa_mask))
throw Poco::Exception("Cannot set signal handler.");
for (size_t i = 0; signals[i]; ++i)
if (sigaddset(&sa.sa_mask, signals[i]))
throw Poco::Exception("Cannot set signal handler.");
for (size_t i = 0; signals[i]; ++i)
if (sigaction(signals[i], &sa, 0))
throw Poco::Exception("Cannot set signal handler.");
}
/// Ставим ErrorHandler для потоков /// Ставим ErrorHandler для потоков
static KillingErrorHandler killing_error_handler; static KillingErrorHandler killing_error_handler;
@ -783,7 +792,7 @@ void BaseDaemon::initialize(Application& self)
/// Выведем ревизию демона /// Выведем ревизию демона
logRevision(); logRevision();
signal_listener.reset(new SignalListener); signal_listener.reset(new SignalListener(*this));
signal_listener_thread.start(*signal_listener); signal_listener_thread.start(*signal_listener);
graphite_writer.reset(new GraphiteWriter("graphite")); graphite_writer.reset(new GraphiteWriter("graphite"));
@ -890,3 +899,35 @@ void BaseDaemon::PID::clear()
file.clear(); file.clear();
} }
} }
/** Processes a signal handed over by SignalListener.
  *
  * For SIGINT, SIGQUIT and SIGTERM: increments terminate_signals_counter, wakes every thread
  * waiting on signal_event and then calls onInterruptSignals().
  * Any other signal is rejected with DB::Exception.
  */
void BaseDaemon::handleSignal(int signal_id)
{
    const bool is_termination_signal =
        signal_id == SIGINT || signal_id == SIGQUIT || signal_id == SIGTERM;

    if (!is_termination_signal)
        throw DB::Exception(std::string("Unsupported signal: ") + strsignal(signal_id));

    /// The mutex is held until the function returns, so it also covers the onInterruptSignals() call:
    /// a thread woken by notify_all() cannot leave waitForTerminationRequest() before that call finishes.
    /// NOTE(review): confirm that holding the lock across onInterruptSignals() is intended.
    std::unique_lock<std::mutex> lock(signal_handler_mutex);

    ++terminate_signals_counter;
    signal_event.notify_all();

    onInterruptSignals(signal_id);
}
/** Reaction to a termination signal: raises the is_cancelled flag and writes
  * an informational log record that names the signal.
  */
void BaseDaemon::onInterruptSignals(int signal_id)
{
    /// Set the flag first, then log.
    is_cancelled.store(true);

    LOG_INFO(&logger(), "Received termination signal(" << strsignal(signal_id) << ")");
}
/** Blocks the calling thread until at least one termination signal has been counted
  * by handleSignal(). Returns immediately if terminate_signals_counter is already non-zero.
  */
void BaseDaemon::waitForTerminationRequest()
{
    std::unique_lock<std::mutex> lock(signal_handler_mutex);

    /// Re-check the counter after every wakeup: condition variables may wake spuriously.
    while (terminate_signals_counter == 0)
        signal_event.wait(lock);
}

@ -1 +1 @@
Subproject commit 27aa9e91c23fab09b5f27174be89cd09f975aaa7 Subproject commit a482f9ec80b4c10e92b3d647b3558978cc453d2e