mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-28 18:42:26 +00:00
dynamic subcolumns: new format and several fixes
This commit is contained in:
parent
205a23282b
commit
6a5daca135
@ -7,7 +7,9 @@
|
|||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <Interpreters/castColumn.h>
|
#include <Interpreters/castColumn.h>
|
||||||
|
|
||||||
#include <Common/FieldVisitors.h>
|
#include <boost/algorithm/string/split.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -169,6 +171,17 @@ Names ColumnObject::getKeys() const
|
|||||||
return keys;
|
return keys;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isPrefix(const Strings & prefix, const Strings & strings)
|
||||||
|
{
|
||||||
|
if (prefix.size() > strings.size())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < prefix.size(); ++i)
|
||||||
|
if (prefix[i] != strings[i])
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void ColumnObject::optimizeTypesOfSubcolumns()
|
void ColumnObject::optimizeTypesOfSubcolumns()
|
||||||
{
|
{
|
||||||
if (optimized_types_of_subcolumns)
|
if (optimized_types_of_subcolumns)
|
||||||
@ -184,11 +197,20 @@ void ColumnObject::optimizeTypesOfSubcolumns()
|
|||||||
if (isNothing(getBaseTypeOfArray(to_type)))
|
if (isNothing(getBaseTypeOfArray(to_type)))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
auto it = std::find_if(subcolumns.begin(), subcolumns.end(),
|
Strings name_parts;
|
||||||
[&name = name](const auto & elem) { return elem.first.size() > name.size() && startsWith(elem.first, name); });
|
boost::split(name_parts, name, boost::is_any_of("."));
|
||||||
|
|
||||||
if (it != subcolumns.end())
|
for (const auto & [other_name, _] : subcolumns)
|
||||||
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Data in Object has ambiguous paths: '{}' and '{}", name, it->first);
|
{
|
||||||
|
if (other_name.size() > name.size())
|
||||||
|
{
|
||||||
|
Strings other_name_parts;
|
||||||
|
boost::split(other_name_parts, other_name, boost::is_any_of("."));
|
||||||
|
|
||||||
|
if (isPrefix(name_parts, other_name_parts))
|
||||||
|
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Data in Object has ambiguous paths: '{}' and '{}", name, other_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (to_type->equals(*from_type))
|
if (to_type->equals(*from_type))
|
||||||
{
|
{
|
||||||
|
@ -151,7 +151,7 @@ double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
|
|||||||
return 0.0;
|
return 0.0;
|
||||||
|
|
||||||
size_t step = num_rows / num_sampled_rows;
|
size_t step = num_rows / num_sampled_rows;
|
||||||
std::uniform_int_distribution<size_t> dist(1, step);
|
std::uniform_int_distribution<size_t> dist(0, step - 1);
|
||||||
|
|
||||||
size_t res = 0;
|
size_t res = 0;
|
||||||
for (size_t i = 0; i < num_rows; i += step)
|
for (size_t i = 0; i < num_rows; i += step)
|
||||||
|
@ -109,7 +109,7 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const
|
|||||||
element_types.reserve(x.size());
|
element_types.reserve(x.size());
|
||||||
|
|
||||||
for (const Field & elem : x)
|
for (const Field & elem : x)
|
||||||
element_types.emplace_back(applyVisitor(FieldToDataType(), elem));
|
element_types.emplace_back(applyVisitor(FieldToDataType(allow_convertion_to_string), elem));
|
||||||
|
|
||||||
return std::make_shared<DataTypeArray>(getLeastSupertype(element_types, allow_convertion_to_string));
|
return std::make_shared<DataTypeArray>(getLeastSupertype(element_types, allow_convertion_to_string));
|
||||||
}
|
}
|
||||||
@ -124,7 +124,7 @@ DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const
|
|||||||
element_types.reserve(ext::size(tuple));
|
element_types.reserve(ext::size(tuple));
|
||||||
|
|
||||||
for (const auto & element : tuple)
|
for (const auto & element : tuple)
|
||||||
element_types.push_back(applyVisitor(FieldToDataType(), element));
|
element_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), element));
|
||||||
|
|
||||||
return std::make_shared<DataTypeTuple>(element_types);
|
return std::make_shared<DataTypeTuple>(element_types);
|
||||||
}
|
}
|
||||||
@ -140,8 +140,8 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const
|
|||||||
{
|
{
|
||||||
const auto & tuple = elem.safeGet<const Tuple &>();
|
const auto & tuple = elem.safeGet<const Tuple &>();
|
||||||
assert(tuple.size() == 2);
|
assert(tuple.size() == 2);
|
||||||
key_types.push_back(applyVisitor(FieldToDataType(), tuple[0]));
|
key_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[0]));
|
||||||
value_types.push_back(applyVisitor(FieldToDataType(), tuple[1]));
|
value_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[1]));
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::make_shared<DataTypeMap>(
|
return std::make_shared<DataTypeMap>(
|
||||||
|
@ -28,6 +28,7 @@ namespace ErrorCodes
|
|||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class FieldVisitorReplaceNull : public StaticVisitor<Field>
|
class FieldVisitorReplaceNull : public StaticVisitor<Field>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
@ -15,6 +15,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory);
|
|||||||
void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory);
|
void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory);
|
||||||
void registerFileSegmentationEngineRegexp(FormatFactory & factory);
|
void registerFileSegmentationEngineRegexp(FormatFactory & factory);
|
||||||
void registerFileSegmentationEngineJSONAsString(FormatFactory & factory);
|
void registerFileSegmentationEngineJSONAsString(FormatFactory & factory);
|
||||||
|
void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory);
|
||||||
|
|
||||||
/// Formats for both input/output.
|
/// Formats for both input/output.
|
||||||
|
|
||||||
@ -76,6 +77,7 @@ void registerOutputFormatProcessorPostgreSQLWire(FormatFactory & factory);
|
|||||||
|
|
||||||
void registerInputFormatProcessorRegexp(FormatFactory & factory);
|
void registerInputFormatProcessorRegexp(FormatFactory & factory);
|
||||||
void registerInputFormatProcessorJSONAsString(FormatFactory & factory);
|
void registerInputFormatProcessorJSONAsString(FormatFactory & factory);
|
||||||
|
void registerInputFormatProcessorJSONAsObject(FormatFactory & factory);
|
||||||
void registerInputFormatProcessorLineAsString(FormatFactory & factory);
|
void registerInputFormatProcessorLineAsString(FormatFactory & factory);
|
||||||
void registerInputFormatProcessorCapnProto(FormatFactory & factory);
|
void registerInputFormatProcessorCapnProto(FormatFactory & factory);
|
||||||
|
|
||||||
@ -89,6 +91,7 @@ void registerFormats()
|
|||||||
registerFileSegmentationEngineJSONEachRow(factory);
|
registerFileSegmentationEngineJSONEachRow(factory);
|
||||||
registerFileSegmentationEngineRegexp(factory);
|
registerFileSegmentationEngineRegexp(factory);
|
||||||
registerFileSegmentationEngineJSONAsString(factory);
|
registerFileSegmentationEngineJSONAsString(factory);
|
||||||
|
registerFileSegmentationEngineJSONAsObject(factory);
|
||||||
|
|
||||||
registerInputFormatNative(factory);
|
registerInputFormatNative(factory);
|
||||||
registerOutputFormatNative(factory);
|
registerOutputFormatNative(factory);
|
||||||
@ -147,6 +150,7 @@ void registerFormats()
|
|||||||
|
|
||||||
registerInputFormatProcessorRegexp(factory);
|
registerInputFormatProcessorRegexp(factory);
|
||||||
registerInputFormatProcessorJSONAsString(factory);
|
registerInputFormatProcessorJSONAsString(factory);
|
||||||
|
registerInputFormatProcessorJSONAsObject(factory);
|
||||||
registerInputFormatProcessorLineAsString(factory);
|
registerInputFormatProcessorLineAsString(factory);
|
||||||
|
|
||||||
#if !defined(ARCADIA_BUILD)
|
#if !defined(ARCADIA_BUILD)
|
||||||
|
@ -452,7 +452,7 @@ void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const AS
|
|||||||
bool found = false;
|
bool found = false;
|
||||||
for (const auto & column : source_columns)
|
for (const auto & column : source_columns)
|
||||||
{
|
{
|
||||||
auto split = Nested::splitName(column.name);
|
auto split = Nested::splitName(column.name, /*reverse=*/ true);
|
||||||
if (split.first == source_name && !split.second.empty())
|
if (split.first == source_name && !split.second.empty())
|
||||||
{
|
{
|
||||||
result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name;
|
result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name;
|
||||||
|
84
src/Processors/Formats/Impl/JSONAsObjectRowInputFormat.cpp
Normal file
84
src/Processors/Formats/Impl/JSONAsObjectRowInputFormat.cpp
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
#include <Processors/Formats/IRowInputFormat.h>
|
||||||
|
#include <Formats/FormatSettings.h>
|
||||||
|
#include <Formats/FormatFactory.h>
|
||||||
|
#include <Formats/JSONEachRowUtils.h>
|
||||||
|
#include <Common/assert_cast.h>
|
||||||
|
#include <IO/ReadHelpers.h>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int BAD_ARGUMENTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
class JSONAsObjectRowInputFormat : public IRowInputFormat
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
JSONAsObjectRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_);
|
||||||
|
|
||||||
|
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
|
||||||
|
String getName() const override { return "JSONAsObjectRowInputFormat"; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const FormatSettings format_settings;
|
||||||
|
};
|
||||||
|
|
||||||
|
JSONAsObjectRowInputFormat::JSONAsObjectRowInputFormat(
|
||||||
|
ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_)
|
||||||
|
: IRowInputFormat(header_, in_, std::move(params_))
|
||||||
|
, format_settings(format_settings_)
|
||||||
|
{
|
||||||
|
if (header_.columns() != 1)
|
||||||
|
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||||
|
"Input format JSONAsObject is only suitable for tables with a single column of type Object but the number of columns is {}",
|
||||||
|
header_.columns());
|
||||||
|
|
||||||
|
if (!isObject(header_.getByPosition(0).type))
|
||||||
|
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||||
|
"Input format JSONAsObject is only suitable for tables with a single column of type Object but the column type is {}",
|
||||||
|
header_.getByPosition(0).type->getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool JSONAsObjectRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
|
||||||
|
{
|
||||||
|
assert(serializations.size() == 1);
|
||||||
|
assert(columns.size() == 1);
|
||||||
|
|
||||||
|
skipWhitespaceIfAny(in);
|
||||||
|
if (!in.eof())
|
||||||
|
serializations[0]->deserializeTextJSON(*columns[0], in, format_settings);
|
||||||
|
|
||||||
|
skipWhitespaceIfAny(in);
|
||||||
|
if (!in.eof() && *in.position() == ',')
|
||||||
|
++in.position();
|
||||||
|
skipWhitespaceIfAny(in);
|
||||||
|
|
||||||
|
return !in.eof();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void registerInputFormatProcessorJSONAsObject(FormatFactory & factory)
|
||||||
|
{
|
||||||
|
factory.registerInputFormatProcessor("JSONAsObject", [](
|
||||||
|
ReadBuffer & buf,
|
||||||
|
const Block & sample,
|
||||||
|
IRowInputFormat::Params params,
|
||||||
|
const FormatSettings & settings)
|
||||||
|
{
|
||||||
|
return std::make_shared<JSONAsObjectRowInputFormat>(buf, sample, std::move(params), settings);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory)
|
||||||
|
{
|
||||||
|
factory.registerFileSegmentationEngine("JSONAsObject", &fileSegmentationEngineJSONEachRowImpl);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,40 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
|
||||||
# shellcheck source=../shell_config.sh
|
|
||||||
. "$CUR_DIR"/../shell_config.sh
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json"
|
|
||||||
$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json(id UInt64, data Object('JSON')) \
|
|
||||||
ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0"
|
|
||||||
|
|
||||||
cat << EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json FORMAT CSV"
|
|
||||||
1,{"k1":"aa","k2":{"k3":"bb","k4":"c"}}
|
|
||||||
2,{"k1":"ee","k5":"ff"}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat << EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json FORMAT CSV"
|
|
||||||
3,{"k5":"foo"}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM t_json ORDER BY id"
|
|
||||||
$CLICKHOUSE_CLIENT -q "SELECT name, column, type \
|
|
||||||
FROM system.parts_columns WHERE table = 't_json' AND database = '$CLICKHOUSE_DATABASE' ORDER BY name"
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json"
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json(id UInt64, data Object('JSON')) \
|
|
||||||
ENGINE = MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0"
|
|
||||||
|
|
||||||
cat << EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json FORMAT CSV"
|
|
||||||
1,{"k1":[{"k2":"aaa","k3":[{"k4":"bbb"},{"k4":"ccc"}]},{"k2":"ddd","k3":[{"k4":"eee"},{"k4":"fff"}]}]}
|
|
||||||
EOF
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "SELECT id, data.k1.k2, data.k1.k3.k4 FROM t_json ORDER BY id"
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "SELECT name, column, type \
|
|
||||||
FROM system.parts_columns WHERE table = 't_json' AND database = '$CLICKHOUSE_DATABASE' ORDER BY name"
|
|
||||||
|
|
||||||
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json"
|
|
Loading…
Reference in New Issue
Block a user