better subcolumns for arrays

This commit is contained in:
Anton Popov 2020-10-14 20:47:14 +03:00
parent cbe12a532e
commit c75262120a
16 changed files with 164 additions and 85 deletions

View File

@ -46,7 +46,7 @@ private:
};
template<int I>
auto get(const NameAndTypePair & name_and_type)
decltype(auto) get(const NameAndTypePair & name_and_type)
{
if constexpr (I == 0)
return name_and_type.name;

View File

@ -32,8 +32,8 @@ namespace ErrorCodes
}
DataTypeArray::DataTypeArray(const DataTypePtr & nested_)
: nested{nested_}
DataTypeArray::DataTypeArray(const DataTypePtr & nested_, size_t nested_level_)
: nested{nested_}, nested_level{nested_level_}
{
}
@ -148,6 +148,23 @@ namespace
offset_values.resize(i);
}
/// Builds a new column (an empty clone of the offsets column of `column_array`) that holds,
/// for every row, the difference between that row's offset and the previous row's offset.
/// The first row's offset is stored as is. The source column is not modified.
MutableColumnPtr getArraySizesPositionIndependent(ColumnArray & column_array)
{
    ColumnArray::Offsets & offsets = column_array.getOffsets();
    MutableColumnPtr sizes_column = column_array.getOffsetsColumn().cloneEmpty();

    const size_t rows = offsets.size();
    if (rows != 0)
    {
        auto & sizes = assert_cast<ColumnVector<ColumnArray::Offset> &>(*sizes_column).getData();
        sizes.reserve(rows);

        /// Running value of the previous offset; zero before the first row,
        /// so the first size equals the first offset.
        ColumnArray::Offset previous_offset = 0;
        for (size_t row = 0; row < rows; ++row)
        {
            sizes.push_back(offsets[row] - previous_offset);
            previous_offset = offsets[row];
        }
    }

    return sizes_column;
}
}
@ -502,14 +519,23 @@ bool DataTypeArray::equals(const IDataType & rhs) const
return typeid(rhs) == typeid(*this) && nested->equals(*static_cast<const DataTypeArray &>(rhs).nested);
}
DataTypePtr DataTypeArray::getSubcolumnType(const String & subcolumn_name) const
DataTypePtr DataTypeArray::tryGetSubcolumnType(const String & subcolumn_name) const
{
ReadBufferFromString buf(subcolumn_name);
size_t dim;
if (checkString("size", buf) && tryReadIntText(dim, buf) && dim < getNumberOfDimensions())
return tryGetSubcolumnTypeImpl(subcolumn_name, 0);
}
DataTypePtr DataTypeArray::tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const
{
if (subcolumn_name == "size" + std::to_string(level))
return std::make_shared<DataTypeUInt64>();
return std::make_shared<DataTypeArray>(nested->getSubcolumnType(subcolumn_name));
DataTypePtr subcolumn;
if (const auto * nested_array = typeid_cast<const DataTypeArray *>(nested.get()))
subcolumn = nested_array->tryGetSubcolumnTypeImpl(subcolumn_name, level + 1);
else
subcolumn = nested->tryGetSubcolumnType(subcolumn_name);
return (subcolumn ? std::make_shared<DataTypeArray>(std::move(subcolumn), nested_level + 1) : subcolumn);
}
MutableColumnPtr DataTypeArray::getSubcolumn(const String & subcolumn_name, IColumn & column) const
@ -521,23 +547,17 @@ MutableColumnPtr DataTypeArray::getSubcolumnImpl(const String & subcolumn_name,
{
auto & column_array = assert_cast<ColumnArray &>(column);
if (subcolumn_name == "size" + std::to_string(level))
return column_array.getOffsetsPtr()->assumeMutable();
return getArraySizesPositionIndependent(column_array);
MutableColumnPtr subcolumn;
if (const auto * nested_array = typeid_cast<const DataTypeArray *>(nested.get()))
return nested_array->getSubcolumnImpl(subcolumn_name, column, level + 1);
subcolumn = nested_array->getSubcolumnImpl(subcolumn_name, column_array.getData(), level + 1);
else
subcolumn = nested->getSubcolumn(subcolumn_name, column_array.getData());
auto subcolumn = nested->getSubcolumn(subcolumn_name, column_array.getData());
return ColumnArray::create(std::move(subcolumn), column_array.getOffsetsPtr()->assumeMutable());
}
/// Returns the escaped name used for the column's files.
/// For a subcolumn it is the escaped storage name, a literal "." and the subcolumn name;
/// for any other column it is the escaped column name.
String DataTypeArray::getEscapedFileName(const NameAndTypePair & column) const
{
    if (!column.isSubcolumn())
        return escapeForFileName(column.name);

    return escapeForFileName(column.getStorageName()) + "." + column.getSubcolumnName();
}
size_t DataTypeArray::getNumberOfDimensions() const
{
const DataTypeArray * nested_array = typeid_cast<const DataTypeArray *>(nested.get());

View File

@ -13,10 +13,12 @@ private:
/// The type of array elements.
DataTypePtr nested;
size_t nested_level = 0;
public:
static constexpr bool is_parametric = true;
DataTypeArray(const DataTypePtr & nested_);
DataTypeArray(const DataTypePtr & nested_, size_t nested_level_ = 0);
TypeIndex getTypeId() const override { return TypeIndex::Array; }
@ -35,6 +37,8 @@ public:
return false;
}
size_t getNestedLevel() const override { return nested_level; }
void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
void deserializeBinary(Field & field, ReadBuffer & istr) const override;
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
@ -111,11 +115,9 @@ public:
return nested->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion();
}
DataTypePtr getSubcolumnType(const String & subcolumn_name) const override;
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const override;
String getEscapedFileName(const NameAndTypePair & column) const override;
const DataTypePtr & getNestedType() const { return nested; }
/// 1 for plain array, 2 for array of arrays and so on.
@ -123,6 +125,7 @@ public:
private:
MutableColumnPtr getSubcolumnImpl(const String & subcolumn_name, IColumn & column, size_t level) const;
DataTypePtr tryGetSubcolumnTypeImpl(const String & subcolumn_name, size_t level) const;
};
}

View File

@ -23,7 +23,6 @@ public:
TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; }
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
DataTypePtr getSubcolumnType(const String & /* subcolumn_name */) const override { return shared_from_this(); }
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

View File

@ -527,13 +527,12 @@ bool DataTypeNullable::equals(const IDataType & rhs) const
return rhs.isNullable() && nested_data_type->equals(*static_cast<const DataTypeNullable &>(rhs).nested_data_type);
}
DataTypePtr DataTypeNullable::getSubcolumnType(const String & subcolumn_name) const
DataTypePtr DataTypeNullable::tryGetSubcolumnType(const String & subcolumn_name) const
{
std::cerr << "(DataTypeNullable::getSubcolumnType) subcolumn_name: " << subcolumn_name << "\n";
if (subcolumn_name == "null")
return std::make_shared<DataTypeUInt8>();
return nested_data_type->getSubcolumnType(subcolumn_name);
return nested_data_type->tryGetSubcolumnType(subcolumn_name);
}
MutableColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, IColumn & column) const
@ -545,14 +544,6 @@ MutableColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, I
return nested_data_type->getSubcolumn(subcolumn_name, column_nullable.getNestedColumn());
}
/// Returns the escaped name used for the column's files.
/// For a subcolumn it is the escaped storage name, a literal "." and the subcolumn name;
/// for any other column it is the escaped column name.
String DataTypeNullable::getEscapedFileName(const NameAndTypePair & column) const
{
    if (!column.isSubcolumn())
        return escapeForFileName(column.name);

    return escapeForFileName(column.getStorageName()) + "." + column.getSubcolumnName();
}
static DataTypePtr create(const ASTPtr & arguments)
{

View File

@ -97,9 +97,8 @@ public:
size_t getSizeOfValueInMemory() const override;
bool onlyNull() const override;
bool canBeInsideLowCardinality() const override { return nested_data_type->canBeInsideLowCardinality(); }
DataTypePtr getSubcolumnType(const String & subcolumn_name) const override;
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const override;
String getEscapedFileName(const NameAndTypePair & column) const override;
const DataTypePtr & getNestedType() const { return nested_data_type; }

View File

@ -421,14 +421,6 @@ void DataTypeTuple::deserializeBinaryBulkWithMultipleStreams(
settings.path.pop_back();
}
/// Returns the escaped name used for the column's files.
/// For a subcolumn it is the escaped storage name, the escaped-dot sequence "%2E" and the
/// subcolumn name; for any other column it is the escaped column name.
String DataTypeTuple::getEscapedFileName(const NameAndTypePair & column) const
{
    if (!column.isSubcolumn())
        return escapeForFileName(column.name);

    return escapeForFileName(column.getStorageName()) + "%2E" + column.getSubcolumnName();
}
void DataTypeTuple::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const
{
for (; value_index < elems.size(); ++value_index)
@ -539,7 +531,7 @@ size_t DataTypeTuple::getSizeOfValueInMemory() const
return res;
}
DataTypePtr DataTypeTuple::getSubcolumnType(const String & subcolumn_name) const
DataTypePtr DataTypeTuple::tryGetSubcolumnType(const String & subcolumn_name) const
{
for (size_t i = 0; i < names.size(); ++i)
{
@ -550,11 +542,11 @@ DataTypePtr DataTypeTuple::getSubcolumnType(const String & subcolumn_name) const
return elems[i];
if (subcolumn_name[name_length] == '.')
return elems[i]->getSubcolumnType(subcolumn_name.substr(name_length + 1));
return elems[i]->tryGetSubcolumnType(subcolumn_name.substr(name_length + 1));
}
}
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
return nullptr;
}
MutableColumnPtr DataTypeTuple::getSubcolumn(const String & subcolumn_name, IColumn & column) const

View File

@ -95,9 +95,8 @@ public:
size_t getMaximumSizeOfValueInMemory() const override;
size_t getSizeOfValueInMemory() const override;
DataTypePtr getSubcolumnType(const String & subcolumn_name) const override;
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const override;
MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const override;
String getEscapedFileName(const NameAndTypePair & column) const override;
const DataTypes & getElements() const { return elems; }
const Strings & getElementNames() const { return names; }

View File

@ -9,6 +9,7 @@
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypeCustom.h>
#include <DataTypes/NestedUtils.h>
#include <boost/algorithm/string/split.hpp>
namespace DB
{
@ -95,6 +96,10 @@ size_t IDataType::getSizeOfValueInMemory() const
/// Throwing counterpart of tryGetSubcolumnType(): returns the type it yields for
/// `subcolumn_name`, or throws ILLEGAL_COLUMN when it yields a null pointer.
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
{
    if (auto found_type = tryGetSubcolumnType(subcolumn_name))
        return found_type;

    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
}
@ -106,21 +111,21 @@ MutableColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, IColumn
std::vector<String> IDataType::getSubcolumnNames() const
{
std::vector<String> res;
enumerateStreams([&res](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
enumerateStreams([&res, this](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */)
{
auto subcolumn_name = IDataType::getSubcolumnNameForStream("", substream_path);
if (!subcolumn_name.empty())
res.push_back(subcolumn_name.substr(1)); // It starts with a dot.
{
subcolumn_name = subcolumn_name.substr(1); // It starts with a dot.
/// Not all of substreams have its subcolumn.
if (tryGetSubcolumnType(subcolumn_name))
res.push_back(subcolumn_name);
}
});
return res;
}
/// Default implementation: the file name is simply the escaped full column name.
String IDataType::getEscapedFileName(const NameAndTypePair & column) const
{
    const auto & full_name = column.name;
    return escapeForFileName(full_name);
}
static String getNameForSubstreamPath(
String stream_name,
const IDataType::SubstreamPath & path,
@ -144,30 +149,65 @@ static String getNameForSubstreamPath(
return stream_name;
}
/// Tells whether the stream described by `path` for `column` is an array-sizes stream
/// named after the nested table rather than after the column itself.
/// Returns false when Nested::extractTableName() leaves the storage name unchanged.
/// Otherwise returns true if `path` is a single ArraySizes substream or the requested
/// subcolumn is "size0".
static bool isOldStyleNestedSizes(const NameAndTypePair & column, const IDataType::SubstreamPath & path)
{
    const auto storage_name = column.getStorageName();

    const bool has_nested_table_prefix = storage_name != Nested::extractTableName(storage_name);
    if (!has_nested_table_prefix)
        return false;

    const bool is_single_sizes_stream = path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes;
    return is_single_sizes_stream || column.getSubcolumnName() == "size0";
}
/// Chooses the delimiter placed before a subcolumn part in a stream file name:
/// a plain "." for the part "null" and for parts starting with "size",
/// the escaped dot "%2E" for every other part.
static String getDelimiterForSubcolumnPart(const String & subcolumn_part)
{
    const bool uses_plain_dot = subcolumn_part == "null" || startsWith(subcolumn_part, "size");
    return uses_plain_dot ? "." : "%2E";
}
/// FIXME: rewrite it.
String IDataType::getFileNameForStream(const NameAndTypePair & column, const IDataType::SubstreamPath & path)
{
if (!column.isSubcolumn())
return getFileNameForStream(column.name, path);
auto storage_name = column.getStorageName();
if (isOldStyleNestedSizes(column, path))
storage_name = Nested::extractTableName(storage_name);
String storage_name = column.getStorageName();
String nested_table_name = Nested::extractTableName(storage_name);
auto stream_name = escapeForFileName(storage_name);
auto subcolumn_name = column.getSubcolumnName();
bool is_sizes_of_nested_type =
(path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes
&& nested_table_name != storage_name) || column.getSubcolumnName() == "size0";
String stream_name;
if (is_sizes_of_nested_type)
if (!subcolumn_name.empty())
{
if (column.getSubcolumnName() == "size0")
return escapeForFileName(nested_table_name) + ".size0";
std::vector<String> subcolumn_parts;
boost::split(subcolumn_parts, subcolumn_name, [](char c) { return c == '.'; });
stream_name = escapeForFileName(Nested::extractTableName(storage_name));
size_t current_nested_level = 0;
for (const auto & elem : path)
{
if (elem.type == Substream::ArrayElements && elem.is_part_of_nested)
{
++current_nested_level;
}
else if (elem.type == Substream::ArraySizes)
{
size_t nested_level = column.type->getNestedLevel();
for (size_t i = 0; i < nested_level - current_nested_level; ++i)
{
if (subcolumn_parts.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get substream name for column {}."
" Not enough subcolumn parts. Needed: {}", column.name, nested_level - current_nested_level);
subcolumn_parts.pop_back();
}
}
}
for (const auto & subcolumn_part : subcolumn_parts)
stream_name += getDelimiterForSubcolumnPart(subcolumn_part) + escapeForFileName(subcolumn_part);
}
else
stream_name = column.getStorageType()->getEscapedFileName(column);
return getNameForSubstreamPath(std::move(stream_name), path, "%2E");
}

View File

@ -37,7 +37,7 @@ struct NameAndTypePair;
*
* DataType is totally immutable object. You can always share them.
*/
class IDataType : private boost::noncopyable, public std::enable_shared_from_this<IDataType>
class IDataType : private boost::noncopyable
{
public:
IDataType();
@ -101,6 +101,8 @@ public:
/// Index of tuple element, starting at 1.
String tuple_element_name;
bool is_part_of_nested = false;
Substream(Type type_) : type(type_) {}
};
@ -115,7 +117,8 @@ public:
void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
virtual DataTypePtr getSubcolumnType(const String & subcolumn_name) const;
virtual DataTypePtr tryGetSubcolumnType(const String & /* subcolumn_name */) const { return nullptr; }
DataTypePtr getSubcolumnType(const String & subcolumn_name) const;
virtual MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const;
std::vector<String> getSubcolumnNames() const;
@ -446,7 +449,7 @@ public:
/// Strings, Numbers, Date, DateTime, Nullable
virtual bool canBeInsideLowCardinality() const { return false; }
virtual String getEscapedFileName(const NameAndTypePair & column) const;
virtual size_t getNestedLevel() const { return 0; }
/// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column.
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint);

View File

@ -523,7 +523,6 @@ void ColumnsDescription::addSubcolumns(const String & storage_name, const DataTy
{
for (const auto & subcolumn_name : storage_type->getSubcolumnNames())
{
std::cerr << "storage_name: " << storage_name << ", subcolumn_name: " << subcolumn_name << "\n";
auto subcolumn = NameAndTypePair(storage_name, subcolumn_name,
storage_type, storage_type->getSubcolumnType(subcolumn_name));

View File

@ -1,6 +1,6 @@
SELECT '====array====';
DROP TABLE IF EXISTS t_arr;
CREATE TABLE t_arr (a Array(UInt32)) ENGINE = MergeTree ORDER BY tuple();
CREATE TABLE t_arr (a Array(UInt32)) ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0;
INSERT INTO t_arr VALUES ([1]) ([]) ([1, 2, 3]) ([1, 2]);
SYSTEM DROP MARK CACHE;
@ -14,7 +14,7 @@ WHERE (type = 'QueryFinish') AND (lower(query) LIKE lower('SELECT a.size0 FROM %
SELECT '====tuple====';
DROP TABLE IF EXISTS t_tup;
CREATE TABLE t_tup (t Tuple(s String, u UInt32)) ENGINE = MergeTree ORDER BY tuple();
CREATE TABLE t_tup (t Tuple(s String, u UInt32)) ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0;
INSERT INTO t_tup VALUES (('foo', 1)) (('bar', 2)) (('baz', 42));
SYSTEM DROP MARK CACHE;
@ -31,7 +31,7 @@ WHERE (type = 'QueryFinish') AND (lower(query) LIKE lower('SELECT t._ FROM %t_tu
SELECT '====nullable====';
DROP TABLE IF EXISTS t_nul;
CREATE TABLE t_nul (n Nullable(UInt32)) ENGINE = MergeTree ORDER BY tuple();
CREATE TABLE t_nul (n Nullable(UInt32)) ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0;
INSERT INTO t_nul VALUES (1) (NULL) (2) (NULL);
SYSTEM DROP MARK CACHE;

View File

@ -0,0 +1,10 @@
([1,NULL],2,'a') ['foo',NULL,'bar'] [['123'],['456','789']] qqqq ['zzz','xxx'] [42,43]
[1,NULL] 2 a ['zzz','xxx'] [42,43] qqqq
2 [0,1] 2 a 0
1 3
[['123'],['456','789']] 2 [1,2] [[0],[0,0]]
([1,NULL],2,'a') ['foo',NULL,'bar'] [['123'],['456','789']] qqqq ['zzz','xxx'] [42,43]
[1,NULL] 2 a ['zzz','xxx'] [42,43] qqqq
2 [0,1] 2 a 0
1 3
[['123'],['456','789']] 2 [1,2] [[0],[0,0]]

View File

@ -9,7 +9,7 @@ CREATE TABLE subcolumns
s Nullable(String)
),
arr Array(Nullable(String)),
arr2 Array(Array(LowCardinality(Nullable(String)))),
arr2 Array(Array(Nullable(String))),
lc LowCardinality(String),
nested Nested(col1 String, col2 Nullable(UInt32))
)
@ -19,9 +19,10 @@ INSERT INTO subcolumns VALUES (([1, NULL], 2, 'a'), ['foo', NULL, 'bar'], [['123
SELECT * FROM subcolumns;
SELECT t.a, t.u, t.s, nested.col1, nested.col2, lc FROM subcolumns;
SELECT t.a.size0, t.a.null, t.u, t.s, t.s.null FROM subcolumns;
-- SELECT arr2, arr2.size0, arr2.size1, arr2.null FROM subcolumns;
-- SELECT nested.col1, nested.col2, nested.col1.size0, nested.col2.size0, nested.col2.null FROM subcolumns;
SELECT sumArray(arr.null), sum(arr.size0) FROM subcolumns;
SELECT arr2, arr2.size0, arr2.size1, arr2.null FROM subcolumns;
-- SELECT nested.col1, nested.col2, nested.col1.size0, nested.col2.size0, nested.col2.null FROM subcolumns;
DROP TABLE IF EXISTS subcolumns;
CREATE TABLE subcolumns
@ -33,7 +34,7 @@ CREATE TABLE subcolumns
s Nullable(String)
),
arr Array(Nullable(String)),
arr2 Array(Array(LowCardinality(Nullable(String)))),
arr2 Array(Array(Nullable(String))),
lc LowCardinality(String),
nested Nested(col1 String, col2 Nullable(UInt32))
)
@ -43,6 +44,6 @@ INSERT INTO subcolumns VALUES (([1, NULL], 2, 'a'), ['foo', NULL, 'bar'], [['123
SELECT * FROM subcolumns;
SELECT t.a, t.u, t.s, nested.col1, nested.col2, lc FROM subcolumns;
SELECT t.a.size0, t.a.null, t.u, t.s, t.s.null FROM subcolumns;
-- SELECT arr2, arr2.size0, arr2.size1, arr2.null FROM subcolumns;
-- SELECT nested.col1, nested.col2, nested.col1.size0, nested.col2.size0, nested.col2.null FROM subcolumns;
SELECT sumArray(arr.null), sum(arr.size0) FROM subcolumns;
SELECT arr2, arr2.size0, arr2.size1, arr2.null FROM subcolumns;
-- SELECT nested.col1, nested.col2, nested.size0, nested.size0, nested.col2.null FROM subcolumns;

View File

@ -0,0 +1,23 @@
#!/usr/bin/env bash

# Creates the same `subcolumns` table under each engine listed in ENGINES, inserts one row
# and reads it back twice: once with `SELECT *` and once selecting every column together
# with its subcolumns (n.null, aN.sizeK, a3.null, t.s, t.v).

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CURDIR"/../shell_config.sh

# Abort on the first failing command.
set -e

create_query="CREATE TABLE subcolumns(n Nullable(UInt32), a1 Array(UInt32),\
a2 Array(Array(Array(UInt32))), a3 Array(Nullable(UInt32)), t Tuple(s String, v UInt32))"

# Engine clauses appended to the CREATE TABLE statement, one table per entry.
declare -a ENGINES=("Log" "StripeLog" "TinyLog" "Memory" \
"MergeTree ORDER BY tuple() SETTINGS min_bytes_for_compact_part='10M'"
"MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part='10M'"
"MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part=0")

for engine in "${ENGINES[@]}"; do
    # Quoted so the engine clause is printed verbatim (no word splitting or globbing).
    echo "$engine"

    # NOTE(review): $CLICKHOUSE_CLIENT is left unquoted as in the original; presumably it may
    # carry extra arguments set by shell_config.sh - confirm before quoting it.
    $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS subcolumns"
    $CLICKHOUSE_CLIENT --query "$create_query ENGINE = $engine"
    $CLICKHOUSE_CLIENT --query "INSERT INTO subcolumns VALUES (100, [1, 2, 3], [[[1, 2], [], [4]], [[5, 6], [7, 8]], [[]]], [1, NULL, 2], ('foo', 200))"
    $CLICKHOUSE_CLIENT --query "SELECT * FROM subcolumns"
    $CLICKHOUSE_CLIENT --query "SELECT n, n.null, a1, a1.size0, a2, a2.size0, a2.size1, a2.size2, a3, a3.size0, a3.null, t, t.s, t.v FROM subcolumns"
done