From 0ce74ccc286eda31e346dff970096738bcdeeb98 Mon Sep 17 00:00:00 2001 From: myrrc Date: Thu, 16 Jul 2020 23:59:32 +0300 Subject: [PATCH] updated interface, bug investigation --- src/Columns/ColumnLowCardinality.h | 7 +++ src/Columns/ColumnUnique.h | 17 +++++- src/Columns/IColumnUnique.h | 8 ++- src/Columns/ReverseIndex.h | 8 +-- src/Functions/array/arrayIndex.h | 97 +++++++++++++----------------- 5 files changed, 73 insertions(+), 64 deletions(-) diff --git a/src/Columns/ColumnLowCardinality.h b/src/Columns/ColumnLowCardinality.h index d874d4675fd..8e0fbb9bb57 100644 --- a/src/Columns/ColumnLowCardinality.h +++ b/src/Columns/ColumnLowCardinality.h @@ -15,6 +15,13 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +/** + * How data is stored (in a nutshell): + * we have a dictionary @e reverse_index in ColumnUnique that holds pairs (DataType, UIntXX) and a column + * with UIntXX holding actual data indices. + * To obtain the value's index, call #getOrFindIndex. + * To operate on the data (so called indices column), call #getIndexes. + */ class ColumnLowCardinality final : public COWHelper { friend class COWHelper; diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index 079e25fc701..650002b6611 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -112,9 +112,22 @@ public: UInt128 getHash() const override { return hash.getHash(*getRawColumnPtr()); } - inline UInt64 getIndexByValue(const StringRef& value) const override + inline UInt64 getOrFindIndex(const StringRef& value) const override { - return reverse_index.getInsertionPointConst(value); + if (std::optional res = reverse_index.getIndex(value); res) + return res.value(); + + auto& nested = *getNestedColumn(); + + for (size_t i = 0; i < nested.size(); ++i) { + std::cout << nested.getDataAt(i) << std::endl; + if (nested.getDataAt(i) == value) + return i; + }; + + throw Exception( + "Trying to find the value that is not present in the index", + ErrorCodes::LOGICAL_ERROR); } private: diff --git a/src/Columns/IColumnUnique.h b/src/Columns/IColumnUnique.h index e310f936700..e731ca6601e 100644 --- a/src/Columns/IColumnUnique.h +++ b/src/Columns/IColumnUnique.h @@ -71,7 +71,11 @@ public: /** * Given some value (usually, of type @e ColumnType) @p value that is convertible to DB::StringRef, obtains its - * index in the DB::ColumnUnique::reverse_index hastable. + * index in the DB::ColumnUnique::reverse_index hashtable (std::nullopt if not found). + * + * The reverse index (StringRef => UInt64) is built lazily, so there are two variants: + * - On the function call it's present. Therefore we obtain the index in O(1). + * - The reverse index is absent. We find the index in O(dictionary size) by performing the linear search. * * @see DB::ReverseIndex * @see DB::ColumnUnique @@ -81,7 +85,7 @@ public: * region, so it can be easily represented as a @e StringRef. So we pass that ref to this function and get its * index in the dictionary, which can be used to operate with the indices column. */ - virtual inline UInt64 getIndexByValue(const StringRef& value) const = 0; + virtual inline UInt64 getOrFindIndex(const StringRef& value) const = 0; void insert(const Field &) override { diff --git a/src/Columns/ReverseIndex.h b/src/Columns/ReverseIndex.h index 90120fc6745..b474ee8d17d 100644 --- a/src/Columns/ReverseIndex.h +++ b/src/Columns/ReverseIndex.h @@ -329,8 +329,8 @@ public: /// If index is not built, builds it. UInt64 getInsertionPoint(const StringRef & data); - /// If index is not found, throws a ErrorCodes::LOGICAL_ERROR - UInt64 getInsertionPointConst(const StringRef & data) const; + /// Returns the found index if the #index is built, otherwise, searches for it linearly. + std::optional getIndex(const StringRef & data) const; UInt64 lastInsertionPoint() const { return size() + base_index; } @@ -521,10 +521,10 @@ UInt64 ReverseIndex::getInsertionPoint(const StringRef & } template -UInt64 ReverseIndex::getInsertionPointConst(const StringRef & data) const +std::optional ReverseIndex::getIndex(const StringRef & data) const { if (!index) - throw Exception("No built index in ReverseIndex", ErrorCodes::LOGICAL_ERROR); + return {}; using IteratorType = typename IndexMapType::iterator; IteratorType iterator; diff --git a/src/Functions/array/arrayIndex.h b/src/Functions/array/arrayIndex.h index 2ccebef186e..d66c5606fc2 100644 --- a/src/Functions/array/arrayIndex.h +++ b/src/Functions/array/arrayIndex.h @@ -634,7 +634,7 @@ inline bool allowNonLowCardinalityArg(const DataTypePtr& arr, const DataTypePtr& } -template +template class FunctionArrayIndex : public IFunction { public: @@ -642,7 +642,7 @@ public: static FunctionPtr create(const Context &) { return std::make_shared(); } private: - using ResultType = typename IndexConv::ResultType; + using ResultType = typename ConcreteAction::ResultType; using ResultColumnType = ColumnVector; /** @@ -682,7 +682,7 @@ private: * (s1, s1, s2, ...), (s2, s1, s2, ...), (s3, s1, s2, ...) */ template - [[gnu::nonnull]] inline bool + [[gnu::nonnull]] inline bool executeIntegral(const ColumnArray * col, Block & block, const ColumnNumbers & arguments, size_t result) { return (executeIntegralExpanded(col, block, arguments, result) || ...); @@ -716,13 +716,10 @@ private: const IColumn* item_arg = block.getByPosition(arguments[1]).column.get(); if (item_arg->onlyNull()) - ArrayIndexNumNullImpl::vector( - col_nested->getData(), - col_array->getOffsets(), - col_res->getData(), - null_map_data); + ArrayIndexNumNullImpl::vector( + col_nested->getData(), col_array->getOffsets(), col_res->getData(), null_map_data); else if (const auto item_arg_const = checkAndGetColumnConst>(item_arg)) - ArrayIndexNumImpl::vector( + ArrayIndexNumImpl::vector( col_nested->getData(), col_array->getOffsets(), item_arg_const->template getValue(), @@ -730,7 +727,7 @@ private: null_map_data, nullptr); else if (const auto item_arg_vector = checkAndGetColumn>(item_arg)) - ArrayIndexNumImpl::vector( + ArrayIndexNumImpl::vector( col_nested->getData(), col_array->getOffsets(), item_arg_vector->getData(), @@ -772,18 +769,19 @@ private: for (size_t i = 0; i < size; ++i) { StringRef elem = col_arg->getDataAt(i); - UInt64 value_index = col_lc->getDictionary().getIndexByValue(elem); + UInt64 value_index = col_lc->getDictionary().getOrFindIndex(elem); ArrayIndexNumImpl< /* Initial data type -- DB::ReverseIndex index */ UInt64, /* Resulting data type -- same */ UInt64, - IndexConv>::vector( - /* data -- indices column */ col_lc->getIndexes(), - col_array->getOffsets(), - /* target value */ value_index, - col_res->getData(), - null_map_data, - null_map_item); + ConcreteAction>:: + vector( + /* data -- indices column */ col_lc->getIndexes(), + col_array->getOffsets(), + /* target value */ value_index, + col_res->getData(), + null_map_data, + null_map_item); } block.getByPosition(result).column = std::move(col_res); @@ -805,23 +803,19 @@ private: if (item_arg->onlyNull()) { - ArrayIndexStringNullImpl::vector_const( - col_nested->getChars(), - col_array->getOffsets(), - col_nested->getOffsets(), - col_res->getData(), - null_map_data); + ArrayIndexStringNullImpl::vector_const( + col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(), col_res->getData(), null_map_data); } else if (const auto item_arg_const = checkAndGetColumnConstStringOrFixedString(item_arg)) { - const ColumnString * item_const_string = + const ColumnString * item_const_string = checkAndGetColumn(&item_arg_const->getDataColumn()); - const ColumnFixedString * item_const_fixedstring = + const ColumnFixedString * item_const_fixedstring = checkAndGetColumn(&item_arg_const->getDataColumn()); if (item_const_string) - ArrayIndexStringImpl::vector_const( + ArrayIndexStringImpl::vector_const( col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(), @@ -830,7 +824,7 @@ private: col_res->getData(), null_map_data); else if (item_const_fixedstring) - ArrayIndexStringImpl::vector_const( + ArrayIndexStringImpl::vector_const( col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(), @@ -840,12 +834,12 @@ private: null_map_data); else throw Exception( - "Logical error: ColumnConst contains not String nor FixedString column", + "Logical error: ColumnConst contains not String nor FixedString column", ErrorCodes::ILLEGAL_COLUMN); } else if (const auto item_arg_vector = checkAndGetColumn(item_arg)) { - ArrayIndexStringImpl::vectorVector( + ArrayIndexStringImpl::vectorVector( col_nested->getChars(), col_array->getOffsets(), col_nested->getOffsets(), @@ -862,10 +856,9 @@ private: return true; } - [[gnu::nonnull]] bool executeConst(Block & block, const ColumnNumbers & arguments, size_t result) { - const ColumnConst * col_array = + const ColumnConst * col_array = checkAndGetColumnConst(block.getByPosition(arguments[0]).column.get()); if (!col_array) @@ -877,21 +870,20 @@ private: if (isColumnConst(*item_arg)) { - typename IndexConv::ResultType current = 0; + typename ConcreteAction::ResultType current = 0; const auto & value = (*item_arg)[0]; for (size_t i = 0, size = arr.size(); i < size; ++i) { if (applyVisitor(FieldVisitorAccurateEquals(), arr[i], value)) { - if (!IndexConv::apply(i, current)) + if (!ConcreteAction::apply(i, current)) break; } } block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst( - item_arg->size(), - static_cast(current)); + item_arg->size(), static_cast(current)); } else { @@ -929,7 +921,7 @@ private: if (hit) { - if (!IndexConv::apply(i, data[row])) + if (!ConcreteAction::apply(i, data[row])) break; } } @@ -952,25 +944,18 @@ private: auto [null_map_data, null_map_item] = nullMapsBuilder(block, arguments); if (item_arg.onlyNull()) - ArrayIndexGenericNullImpl::vector( - col_nested, - col->getOffsets(), - col_res->getData(), - null_map_data); + ArrayIndexGenericNullImpl::vector(col_nested, col->getOffsets(), col_res->getData(), null_map_data); else if (isColumnConst(item_arg)) - ArrayIndexGenericImpl::vector( - col_nested, - col->getOffsets(), - assert_cast(item_arg).getDataColumn(), - col_res->getData(), /// TODO This is wrong. - null_map_data, nullptr); + ArrayIndexGenericImpl::vector( + col_nested, + col->getOffsets(), + assert_cast(item_arg).getDataColumn(), + col_res->getData(), /// TODO This is wrong. + null_map_data, + nullptr); else - ArrayIndexGenericImpl::vector( - col_nested, - col->getOffsets(), - *item_arg.convertToFullColumnIfConst(), - col_res->getData(), - null_map_data, null_map_item); + ArrayIndexGenericImpl::vector( + col_nested, col->getOffsets(), *item_arg.convertToFullColumnIfConst(), col_res->getData(), null_map_data, null_map_item); block.getByPosition(result).column = std::move(col_res); return true; @@ -995,7 +980,7 @@ public: throw Exception("First argument for function " + getName() + " must be an array.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if (!arguments[1]->onlyNull() + if (!arguments[1]->onlyNull() && !allowArrayIndex(array_type->getNestedType(), arguments[1]) && !allowNonLowCardinalityArg(array_type->getNestedType(), arguments[1])) throw Exception("Types of array and 2nd argument of function " @@ -1003,7 +988,7 @@ public: + arguments[0]->getName() + " and " + arguments[1]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - return std::make_shared>(); + return std::make_shared>(); } /**