Fix special builds and tests

commit 2626880f6e
parent 99026efcdc
avogar 2024-07-22 14:28:07 +00:00
34 changed files with 178 additions and 237 deletions

View File

@@ -18,13 +18,13 @@ namespace ErrorCodes
 namespace
 {
-static const FormatSettings & getFormatSettings()
+const FormatSettings & getFormatSettings()
 {
     static const FormatSettings settings;
     return settings;
 }
-static const std::shared_ptr<SerializationDynamic> & getDynamicSerialization()
+const std::shared_ptr<SerializationDynamic> & getDynamicSerialization()
 {
     static const std::shared_ptr<SerializationDynamic> dynamic_serialization = std::make_shared<SerializationDynamic>();
     return dynamic_serialization;
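Note: this hunk drops the redundant `static` from helper functions that already sit in an anonymous namespace, which is presumably what the special builds (clang-tidy style checks) flag. A minimal sketch of the idiom, with illustrative names of my own:

```cpp
#include <string>

// Sketch, not commit code: inside an anonymous namespace a function already
// has internal linkage, so a leading `static` is redundant and linters flag
// it. The function-local static inside is unaffected: it still provides a
// lazily-initialized, process-wide cached instance (thread-safe since C++11).
namespace
{
const std::string & cachedGreeting()   // no leading `static` needed
{
    static const std::string greeting = "hello";   // constructed once
    return greeting;
}
}
```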
@@ -394,7 +394,7 @@ void ColumnObject::doInsertFrom(const IColumn & src, size_t n)
     const auto & src_object_column = assert_cast<const ColumnObject &>(src);
     /// First, insert typed paths, they must be the same for both columns.
-    for (auto & [path, column] : src_object_column.typed_paths)
+    for (const auto & [path, column] : src_object_column.typed_paths)
         typed_paths[path]->insertFrom(*column, n);
     /// Second, insert dynamic paths and extend them if needed.
@@ -428,7 +428,7 @@ void ColumnObject::doInsertRangeFrom(const IColumn & src, size_t start, size_t l
     const auto & src_object_column = assert_cast<const ColumnObject &>(src);
     /// First, insert typed paths, they must be the same for both columns.
-    for (auto & [path, column] : src_object_column.typed_paths)
+    for (const auto & [path, column] : src_object_column.typed_paths)
         typed_paths[path]->insertRangeFrom(*column, start, length);
     /// Second, insert dynamic paths and extend them if needed.
@@ -898,9 +898,9 @@ void ColumnObject::ensureOwnership()
 size_t ColumnObject::byteSize() const
 {
     size_t size = 0;
-    for (auto & [_, column] : typed_paths)
+    for (const auto & [_, column] : typed_paths)
         size += column->byteSize();
-    for (auto & [_, column] : dynamic_paths)
+    for (const auto & [_, column] : dynamic_paths)
         size += column->byteSize();
     size += shared_data->byteSize();
     return size;
@@ -909,9 +909,9 @@ size_t ColumnObject::byteSize() const
 size_t ColumnObject::byteSizeAt(size_t n) const
 {
     size_t size = 0;
-    for (auto & [_, column] : typed_paths)
+    for (const auto & [_, column] : typed_paths)
         size += column->byteSizeAt(n);
-    for (auto & [_, column] : dynamic_paths)
+    for (const auto & [_, column] : dynamic_paths)
         size += column->byteSizeAt(n);
     size += shared_data->byteSizeAt(n);
     return size;
@@ -920,9 +920,9 @@ size_t ColumnObject::byteSizeAt(size_t n) const
 size_t ColumnObject::allocatedBytes() const
 {
     size_t size = 0;
-    for (auto & [_, column] : typed_paths)
+    for (const auto & [_, column] : typed_paths)
         size += column->allocatedBytes();
-    for (auto & [_, column] : dynamic_paths)
+    for (const auto & [_, column] : dynamic_paths)
         size += column->allocatedBytes();
     size += shared_data->allocatedBytes();
     return size;
@@ -1040,9 +1040,9 @@ void ColumnObject::finalize()
 bool ColumnObject::isFinalized() const
 {
     bool finalized = true;
-    for (auto & [_, column] : typed_paths)
+    for (const auto & [_, column] : typed_paths)
         finalized &= column->isFinalized();
-    for (auto & [_, column] : dynamic_paths)
+    for (const auto & [_, column] : dynamic_paths)
         finalized &= column->isFinalized();
     finalized &= shared_data->isFinalized();
     return finalized;
@@ -1144,8 +1144,8 @@ size_t ColumnObject::findPathLowerBoundInSharedData(StringRef path, const Column
     Iterator() = delete;
     Iterator(const ColumnString * data_, size_t index_) : data(data_), index(index_) {}
-    Iterator(const Iterator & rhs) : data(rhs.data), index(rhs.index) {}
-    Iterator & operator=(const Iterator & rhs) { data = rhs.data; index = rhs.index; return *this; }
+    Iterator(const Iterator & rhs) = default;
+    Iterator & operator=(const Iterator & rhs) = default;
     inline Iterator& operator+=(difference_type rhs) { index += rhs; return *this;}
     inline StringRef operator*() const {return data->getDataAt(index);}
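Note: the hand-written copy constructor and assignment operator copied every member one by one, which is exactly what the compiler-generated versions do; `= default` says so directly and silences modernize-style warnings. A hypothetical sketch:

```cpp
#include <cstddef>

// Hypothetical type, not the commit's Iterator: when copy operations just
// copy each member, defaulting them is equivalent and clearer.
struct Cursor
{
    const char * data = nullptr;
    std::size_t index = 0;

    Cursor() = default;
    Cursor(const Cursor &) = default;               // was: : data(rhs.data), index(rhs.index) {}
    Cursor & operator=(const Cursor &) = default;   // was: member-by-member assignment body
};
```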

View File

@@ -188,7 +188,7 @@ public:
     static void fillPathColumnFromSharedData(IColumn & path_column, StringRef path, const ColumnPtr & shared_data_column, size_t start, size_t end);
 private:
-    void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<String> & dynamic_paths_to_shared_data, size_t start, size_t length);
+    void insertFromSharedDataAndFillRemainingDynamicPaths(const ColumnObject & src_object_column, std::vector<String> & src_dynamic_paths_for_shared_data, size_t start, size_t length);
     void serializePathAndValueIntoArena(Arena & arena, const char *& begin, StringRef path, StringRef value, StringRef & res) const;
     /// Map path -> column for paths with explicitly specified types.

View File

@@ -196,10 +196,29 @@ MutableColumnPtr DataTypeObject::createColumn() const
 namespace
 {
+/// It is possible to have nested JSON object inside Dynamic. For example when we have an array of JSON objects.
+/// During type inference in parsing in case of creating nested JSON objects, we reduce max_dynamic_paths/max_dynamic_types by factors
+/// NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR/NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR.
+/// So the type name will actually be JSON(max_dynamic_paths=N, max_dynamic_types=M). But we want the user to be able to query it
+/// using json.array.:`Array(JSON)`.some.path without specifying max_dynamic_paths/max_dynamic_types.
+/// To support it, we do a trick - we replace JSON name in subcolumn to JSON(max_dynamic_paths=N, max_dynamic_types=M), because we know
+/// the exact values of max_dynamic_paths/max_dynamic_types for it.
+void replaceJSONTypeNameIfNeeded(String & type_name, size_t max_dynamic_paths, size_t max_dynamic_types)
+{
+    auto pos = type_name.find("JSON");
+    while (pos != String::npos)
+    {
+        /// Replace only if we don't already have parameters in JSON type declaration.
+        if (pos + 4 == type_name.size() || type_name[pos + 4] != '(')
+            type_name.replace(pos, 4, fmt::format("JSON(max_dynamic_paths={}, max_dynamic_types={})", max_dynamic_paths / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR, std::max(max_dynamic_types / DataTypeObject::NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR, 1lu)));
+        pos = type_name.find("JSON", pos + 4);
+    }
+}
 /// JSON subcolumn name with Dynamic type subcolumn looks like this:
 /// "json.some.path.:`Type_name`.some.subcolumn".
 /// We back quoted type name during identifier parsing so we can distinguish type subcolumn and path element ":TypeName".
-std::pair<String, String> splitPathAndDynamicTypeSubcolumn(std::string_view subcolumn_name)
+std::pair<String, String> splitPathAndDynamicTypeSubcolumn(std::string_view subcolumn_name, size_t max_dynamic_paths, size_t max_dynamic_types)
 {
     /// Try to find dynamic type subcolumn in a form .:`Type`.
     auto pos = subcolumn_name.find(".:`");
@@ -212,6 +231,8 @@ std::pair<String, String> splitPathAndDynamicTypeSubcolumn(std::string_view subc
     if (!tryReadBackQuotedString(dynamic_subcolumn, buf))
         return {String(subcolumn_name), ""};
+    replaceJSONTypeNameIfNeeded(dynamic_subcolumn, max_dynamic_paths, max_dynamic_types);
     /// If there is more data in the buffer - it's subcolumn of a type, append it to the type name.
     if (!buf.eof())
         dynamic_subcolumn += String(buf.position(), buf.available());
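Note: a worked example of the renaming trick described above, assuming a parent JSON type declared with max_dynamic_paths=256 and max_dynamic_types=16 and the reduce factors of 4 and 2 declared in DataTypeObject.h:

```cpp
// Hypothetical walk-through (the values follow from the constants in this commit):
String subcolumn = "Array(JSON)";
replaceJSONTypeNameIfNeeded(subcolumn, /*max_dynamic_paths=*/256, /*max_dynamic_types=*/16);
// subcolumn is now "Array(JSON(max_dynamic_paths=64, max_dynamic_types=8))",
// i.e. 256 / NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR and
//      16  / NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR.
// An occurrence that already carries parameters, "JSON(...)", is skipped by
// the `type_name[pos + 4] != '('` check, so user-specified parameters survive.
```

This is what lets a query spell `json.array.:`Array(JSON)`.some.path` without repeating the parameters.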
@@ -333,7 +354,7 @@ std::unique_ptr<ISerialization::SubstreamData> DataTypeObject::getDynamicSubcolu
     }
     /// Split requested subcolumn to the JSON path and Dynamic type subcolumn.
-    auto [path, path_subcolumn] = splitPathAndDynamicTypeSubcolumn(subcolumn_name);
+    auto [path, path_subcolumn] = splitPathAndDynamicTypeSubcolumn(subcolumn_name, max_dynamic_paths, max_dynamic_types);
     std::unique_ptr<SubstreamData> res;
     if (auto it = typed_paths.find(path); it != typed_paths.end())
     {
@@ -373,18 +394,6 @@ std::unique_ptr<ISerialization::SubstreamData> DataTypeObject::getDynamicSubcolu
     /// Get subcolumn for Dynamic type if needed.
     if (!path_subcolumn.empty())
     {
-        /// It is possible to have nested JSON object inside Dynamic. For example when we have an array of JSON objects.
-        /// During parsing in case of creating nested JSON objects, we reduce max_dynamic_paths/max_dynamic_types by NESTED_OBJECT_REDUCE_FACTOR factor.
-        /// So the type name will actually be JSON(max_dynamic_paths=N, max_dynamic_types=M). But we want the user to be able to query it
-        /// using json.array.:`Array(JSON)`.some.path without specifying max_dynamic_paths/max_dynamic_types.
-        /// To support it, we do a trick - we replace JSON name in subcolumn to JSON(max_dynamic_paths=N, max_dynamic_types=M), because we know
-        /// the exact values of max_dynamic_paths/max_dynamic_types for it.
-        auto pos = path_subcolumn.find("JSON");
-        /// We want to replace JSON keyword only in the first subcolumn part before the first dot.
-        auto first_dot_pos = path_subcolumn.find('.');
-        if (pos != path_subcolumn.npos && (first_dot_pos == path_subcolumn.npos || pos < first_dot_pos))
-            path_subcolumn.replace(pos, 4, fmt::format("JSON(max_dynamic_paths={}, max_dynamic_types={})", max_dynamic_paths / NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR, std::max(max_dynamic_types / NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR, 1lu)));
         res = res->type->getSubcolumnData(path_subcolumn, *res, throw_if_null);
         if (!res)
             return nullptr;

View File

@@ -23,7 +23,7 @@ public:
     static constexpr size_t NESTED_OBJECT_MAX_DYNAMIC_PATHS_REDUCE_FACTOR = 4;
     static constexpr size_t NESTED_OBJECT_MAX_DYNAMIC_TYPES_REDUCE_FACTOR = 2;
-    DataTypeObject(
+    explicit DataTypeObject(
         const SchemaFormat & schema_format_,
         const std::unordered_map<String, DataTypePtr> & typed_paths_ = {},
         const std::unordered_set<String> & paths_to_skip_ = {},
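Note: several constructors in this commit gain `explicit` (here, and in TypedPathSubcolumnCreator and DynamicNode below). Since every parameter after the first is defaulted, each was callable with a single argument and therefore acted as an implicit converting constructor. A generic sketch with a hypothetical type:

```cpp
// Hypothetical type: one required argument plus defaults means the constructor
// participates in implicit conversions unless marked explicit.
struct Wrapper
{
    explicit Wrapper(int format, int flags = 0) : format_(format), flags_(flags) {}
    int format_;
    int flags_;
};

void take(const Wrapper &) {}

int main()
{
    // take(42);        // would compile without `explicit`; now a hard error
    take(Wrapper(42));  // the conversion has to be spelled out
}
```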

View File

@@ -31,7 +31,7 @@ public:
     void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
-    virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
+    void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
     void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;

View File

@@ -48,7 +48,7 @@ bool SerializationObject::shouldSkipPath(const String & path) const
     if (paths_to_skip.contains(path))
         return true;
-    auto it = std::lower_bound(sorted_typed_paths.begin(), sorted_typed_paths.end(), path);
+    auto it = std::lower_bound(sorted_paths_to_skip.begin(), sorted_paths_to_skip.end(), path);
     if (it != sorted_paths_to_skip.end() && it != sorted_paths_to_skip.begin() && path.starts_with(*std::prev(it)))
         return true;
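Note: this one is a genuine bug fix rather than a build fix: the binary search ran over sorted_typed_paths while the subsequent prefix check read sorted_paths_to_skip, so skip-by-prefix consulted the wrong list. A simplified, self-contained sketch of the intended idiom (it drops the extra end() guard the real code keeps):

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// In a sorted list of skip prefixes, lower_bound(path) lands just past every
// entry that could be a prefix of `path`, so checking the previous element
// is sufficient.
bool skippedByPrefix(const std::vector<std::string> & sorted_paths_to_skip, const std::string & path)
{
    auto it = std::lower_bound(sorted_paths_to_skip.begin(), sorted_paths_to_skip.end(), path);
    return it != sorted_paths_to_skip.begin() && path.starts_with(*std::prev(it));
}

int main()
{
    const std::vector<std::string> skip = {"a.b", "meta"};   // must stay sorted
    assert(skippedByPrefix(skip, "a.b.c"));                  // "a.b" is a prefix
    assert(!skippedByPrefix(skip, "a.c"));                   // nearest entry is not
}
```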

View File

@@ -101,7 +101,7 @@ private:
 {
     String path;
-    TypedPathSubcolumnCreator(const String & path_) : path(path_) {}
+    explicit TypedPathSubcolumnCreator(const String & path_) : path(path_) {}
     DataTypePtr create(const DataTypePtr & prev) const override { return prev; }
     ColumnPtr create(const ColumnPtr & prev) const override { return prev; }

View File

@@ -67,7 +67,6 @@ private:
         throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Text/binary serialization is not implemented for object sub-object subcolumn");
     }
-private:
     String path_prefix;
     std::unordered_map<String, SerializationPtr> typed_paths_serializations;
     SerializationPtr dynamic_serialization;

View File

@@ -1389,7 +1389,7 @@ template <typename JSONParser>
 class DynamicNode : public JSONExtractTreeNode<JSONParser>
 {
 public:
-    DynamicNode(
+    explicit DynamicNode(
         size_t max_dynamic_paths_for_object_ = DataTypeObject::DEFAULT_MAX_SEPARATELY_STORED_PATHS,
         size_t max_dynamic_types_for_object_ = DataTypeDynamic::DEFAULT_MAX_DYNAMIC_TYPES)
         : max_dynamic_paths_for_object(max_dynamic_paths_for_object_), max_dynamic_types_for_object(max_dynamic_types_for_object_)
@@ -1412,7 +1412,7 @@
         }
         auto & variant_column = column_dynamic.getVariantColumn();
-        auto & variant_info = column_dynamic.getVariantInfo();
+        const auto & variant_info = column_dynamic.getVariantInfo();
         /// First, try to insert element into current variants but with no types conversion.
         /// We want to avoid inferring the type on each row, so if we can insert this element into
@@ -1486,20 +1486,20 @@ private:
         switch (element.type())
         {
             case ElementType::NULL_VALUE:
-                return getNullType();
+                return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
             case ElementType::BOOL:
-                return getBoolType();
+                return DataTypeFactory::instance().get("Bool");
             case ElementType::INT64:
             {
-                auto type = getInt64Type();
+                auto type = std::make_shared<DataTypeInt64>();
                 if (element.getInt64() < 0)
                     json_inference_info.negative_integers.insert(type.get());
                 return type;
             }
             case ElementType::UINT64:
-                return getUInt64Type();
+                return std::make_shared<DataTypeUInt64>();
             case ElementType::DOUBLE:
-                return getFloat64Type();
+                return std::make_shared<DataTypeFloat64>();
             case ElementType::STRING:
             {
                 auto data = element.getString();
@@ -1516,7 +1516,7 @@ private:
                 }
             }
-            return getStringType();
+            return std::make_shared<DataTypeString>();
         }
         case ElementType::ARRAY:
         {
@@ -1527,7 +1527,7 @@ private:
                 types.push_back(elementToDataTypeImpl(value, format_settings, json_inference_info));
             if (types.empty())
-                return getEmptyArrayType();
+                return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
             if (checkIfTypesAreEqual(types))
                 return std::make_shared<DataTypeArray>(types.back());
@@ -1561,51 +1561,6 @@ private:
         }
     }
-    /// During schema inference we create shared_ptr to the some data types quite a lot.
-    /// Single creating of such shared_ptr is not expensive, but when it happens on each
-    /// column on each row, it can be noticeable.
-    const DataTypePtr & getBoolType() const
-    {
-        static const DataTypePtr bool_type = DataTypeFactory::instance().get("Bool");
-        return bool_type;
-    }
-    const DataTypePtr & getStringType() const
-    {
-        static const DataTypePtr string_type = std::make_shared<DataTypeString>();
-        return string_type;
-    }
-    const DataTypePtr & getInt64Type() const
-    {
-        static const DataTypePtr int64_type = std::make_shared<DataTypeInt64>();
-        return int64_type;
-    }
-    const DataTypePtr & getUInt64Type() const
-    {
-        static const DataTypePtr uint64_type = std::make_shared<DataTypeUInt64>();
-        return uint64_type;
-    }
-    const DataTypePtr & getFloat64Type() const
-    {
-        static const DataTypePtr float64_type = std::make_shared<DataTypeFloat64>();
-        return float64_type;
-    }
-    const DataTypePtr & getNullType() const
-    {
-        static const DataTypePtr null_type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
-        return null_type;
-    }
-    const DataTypePtr & getEmptyArrayType() const
-    {
-        static const DataTypePtr empty_array_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
-        return empty_array_type;
-    }
     size_t max_dynamic_paths_for_object;
     size_t max_dynamic_types_for_object;
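Note: the deleted getters cached one DataTypePtr per type for the whole process. A plausible reason for reverting to per-call make_shared, inferred from the surrounding code rather than stated in the commit: inference metadata such as negative_integers and numbers_parsed_from_json_strings is keyed by the DataType object's address (`type.get()`), so every inferred occurrence needs a distinct object; a shared singleton would let a flag set for one occurrence leak to all of them. A self-contained sketch of the identity issue:

```cpp
#include <cassert>
#include <memory>
#include <set>

// Stub stands in for DataTypeInt64; the point is pointer identity, not types.
struct DataTypeStub {};

int main()
{
    std::set<const DataTypeStub *> negative_integers;

    auto a = std::make_shared<DataTypeStub>();   // inferred for the value -5
    auto b = std::make_shared<DataTypeStub>();   // inferred for the value 7

    negative_integers.insert(a.get());           // mark only `a` as negative

    // Distinct objects keep the bookkeeping per occurrence; with a cached
    // singleton, a.get() == b.get() and `b` would look negative too.
    assert(!negative_integers.contains(b.get()));
}
```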
@@ -1772,7 +1727,7 @@ private:
             }
         }
         /// Try to add a new dynamic path.
-        else if (auto dynamic_column = column_object.tryToAddNewDynamicPath(current_path))
+        else if (auto * dynamic_column = column_object.tryToAddNewDynamicPath(current_path))
         {
             if (!dynamic_node->insertResultToColumn(*dynamic_column, element, insert_settings, format_settings, error))
             {

View File

@@ -36,63 +36,6 @@ namespace ErrorCodes
 namespace
 {
-/// During schema inference we create shared_ptr to the some data types quite a lot.
-/// Single creating of such shared_ptr is not expensive, but when it happens on each
-/// column on each row, it can be noticeable.
-const DataTypePtr & getBoolType()
-{
-    static const DataTypePtr bool_type = DataTypeFactory::instance().get("Bool");
-    return bool_type;
-}
-const DataTypePtr & getStringType()
-{
-    static const DataTypePtr string_type = std::make_shared<DataTypeString>();
-    return string_type;
-}
-const DataTypePtr & getInt64Type()
-{
-    static const DataTypePtr int64_type = std::make_shared<DataTypeInt64>();
-    return int64_type;
-}
-const DataTypePtr & getUInt64Type()
-{
-    static const DataTypePtr uint64_type = std::make_shared<DataTypeUInt64>();
-    return uint64_type;
-}
-const DataTypePtr & getFloat64Type()
-{
-    static const DataTypePtr float64_type = std::make_shared<DataTypeFloat64>();
-    return float64_type;
-}
-const DataTypePtr & getDateType()
-{
-    static const DataTypePtr date_type = std::make_shared<DataTypeDate>();
-    return date_type;
-}
-const DataTypePtr & getDateTime64Type()
-{
-    static const DataTypePtr date_type = std::make_shared<DataTypeDateTime64>(9);
-    return date_type;
-}
-const DataTypePtr & getNullType()
-{
-    static const DataTypePtr null_type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
-    return null_type;
-}
-const DataTypePtr & getEmptyArrayType()
-{
-    static const DataTypePtr empty_array_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
-    return empty_array_type;
-}
 /// Special data type that represents JSON object as a set of paths and their types.
 /// It supports merging two JSON objects and creating Named Tuple from itself.
 /// It's used only for schema inference of Named Tuples from JSON objects.
@@ -265,7 +208,7 @@ namespace
     if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
     {
         if (use_string_type_for_ambiguous_paths)
-            return getStringType();
+            return std::make_shared<DataTypeString>();
         throw Exception(
             ErrorCodes::INCORRECT_DATA,
@@ -331,7 +274,7 @@ namespace
         bool is_negative = json_info && json_info->negative_integers.contains(type.get());
         have_negative_integers |= is_negative;
         if (!is_negative)
-            type = getUInt64Type();
+            type = std::make_shared<DataTypeUInt64>();
     }
 }
@@ -352,7 +295,7 @@ namespace
     WhichDataType which(type);
     if (which.isInt64() || which.isUInt64())
     {
-        auto new_type = getFloat64Type();
+        const auto & new_type = std::make_shared<DataTypeFloat64>();
         if (json_info && json_info->numbers_parsed_from_json_strings.erase(type.get()))
             json_info->numbers_parsed_from_json_strings.insert(new_type.get());
         type = new_type;
@@ -376,7 +319,7 @@ namespace
     for (auto & type : data_types)
     {
         if (isDate(type) || isDateTime64(type))
-            type = getStringType();
+            type = std::make_shared<DataTypeString>();
     }
     type_indexes.erase(TypeIndex::Date);
@@ -390,7 +333,7 @@ namespace
     for (auto & type : data_types)
     {
         if (isDate(type))
-            type = getDateTime64Type();
+            type = std::make_shared<DataTypeDateTime64>(9);
     }
     type_indexes.erase(TypeIndex::Date);
@@ -412,7 +355,7 @@ namespace
         if (isNumber(type)
             && (settings.json.read_numbers_as_strings || !json_info
                 || json_info->numbers_parsed_from_json_strings.contains(type.get())))
-            type = getStringType();
+            type = std::make_shared<DataTypeString>();
     }
     updateTypeIndexes(data_types, type_indexes);
@@ -435,11 +378,11 @@ namespace
         if (isBool(type))
         {
             if (have_signed_integers)
-                type = getInt64Type();
+                type = std::make_shared<DataTypeInt64>();
             else if (have_unsigned_integers)
-                type = getUInt64Type();
+                type = std::make_shared<DataTypeUInt64>();
             else
-                type = getFloat64Type();
+                type = std::make_shared<DataTypeFloat64>();
         }
     }
@@ -456,7 +399,7 @@ namespace
     for (auto & type : data_types)
    {
         if (isBool(type))
-            type = getStringType();
+            type = std::make_shared<DataTypeString>();
     }
     type_indexes.erase(TypeIndex::UInt8);
@@ -606,7 +549,7 @@ namespace
     for (auto & type : data_types)
     {
         if (isMap(type))
-            type = getStringType();
+            type = std::make_shared<DataTypeString>();
     }
     type_indexes.erase(TypeIndex::Map);
@@ -856,7 +799,7 @@ namespace
     /// Empty array has type Array(Nothing)
     if (nested_types.empty())
-        return getEmptyArrayType();
+        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeNothing>());
     if (checkIfTypesAreEqual(nested_types))
         return std::make_shared<DataTypeArray>(std::move(nested_types.back()));
@@ -969,13 +912,13 @@ namespace
     /// NOTE: it may break parsing of tryReadFloat() != tryReadIntText() + parsing of '.'/'e'
     /// But, for now it is true
     if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional) && has_fractional)
-        return getFloat64Type();
+        return std::make_shared<DataTypeFloat64>();
     Int64 tmp_int;
     buf.position() = number_start;
     if (tryReadIntText(tmp_int, buf))
     {
-        auto type = getInt64Type();
+        auto type = std::make_shared<DataTypeInt64>();
         if (json_info && tmp_int < 0)
             json_info->negative_integers.insert(type.get());
         return type;
@@ -985,7 +928,7 @@ namespace
     UInt64 tmp_uint;
     buf.position() = number_start;
     if (tryReadIntText(tmp_uint, buf))
-        return getUInt64Type();
+        return std::make_shared<DataTypeUInt64>();
     return nullptr;
 }
@@ -997,13 +940,13 @@ namespace
     PeekableReadBufferCheckpoint checkpoint(peekable_buf);
     if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings, has_fractional) && has_fractional)
-        return getFloat64Type();
+        return std::make_shared<DataTypeFloat64>();
     peekable_buf.rollbackToCheckpoint(/* drop= */ false);
     Int64 tmp_int;
     if (tryReadIntText(tmp_int, peekable_buf))
     {
-        auto type = getInt64Type();
+        auto type = std::make_shared<DataTypeInt64>();
         if (json_info && tmp_int < 0)
             json_info->negative_integers.insert(type.get());
         return type;
@@ -1013,11 +956,11 @@ namespace
     /// In case of Int64 overflow we can try to infer UInt64.
     UInt64 tmp_uint;
     if (tryReadIntText(tmp_uint, peekable_buf))
-        return getUInt64Type();
+        return std::make_shared<DataTypeUInt64>();
 }
 else if (tryReadFloat<is_json>(tmp_float, buf, settings, has_fractional))
 {
-    return getFloat64Type();
+    return std::make_shared<DataTypeFloat64>();
 }
 /// This is not a number.
@@ -1034,7 +977,7 @@ namespace
     Int64 tmp_int;
     if (tryReadIntText(tmp_int, buf) && buf.eof())
     {
-        auto type = getInt64Type();
+        auto type = std::make_shared<DataTypeInt64>();
         if (json_inference_info && tmp_int < 0)
             json_inference_info->negative_integers.insert(type.get());
         return type;
@@ -1046,7 +989,7 @@ namespace
     /// In case of Int64 overflow, try to infer UInt64
     UInt64 tmp_uint;
     if (tryReadIntText(tmp_uint, buf) && buf.eof())
-        return getUInt64Type();
+        return std::make_shared<DataTypeUInt64>();
 }
 /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
@@ -1055,7 +998,7 @@ namespace
     Float64 tmp;
     bool has_fractional;
     if (tryReadFloat<is_json>(tmp, buf, settings, has_fractional) && buf.eof())
-        return getFloat64Type();
+        return std::make_shared<DataTypeFloat64>();
     return nullptr;
 }
@@ -1079,7 +1022,7 @@ namespace
     if constexpr (is_json)
     {
         if (json_info->is_object_key)
-            return getStringType();
+            return std::make_shared<DataTypeString>();
     }
     if (auto type = tryInferDateOrDateTimeFromString(field, settings))
@@ -1097,7 +1040,7 @@ namespace
         }
     }
-    return getStringType();
+    return std::make_shared<DataTypeString>();
 }
 bool tryReadJSONObject(ReadBuffer & buf, const FormatSettings & settings, DataTypeJSONPaths::Paths & paths, const std::vector<String> & path, JSONInferenceInfo * json_info, size_t depth)
@@ -1254,7 +1197,7 @@ namespace
         return std::make_shared<DataTypeObjectDeprecated>("json", true);
     if (settings.json.read_objects_as_strings)
-        return getStringType();
+        return std::make_shared<DataTypeString>();
     transformInferredTypesIfNeededImpl<is_json>(value_types, settings, json_info);
     if (!checkIfTypesAreEqual(value_types))
@@ -1320,15 +1263,15 @@ namespace
     /// Bool
     if (checkStringCaseInsensitive("true", buf) || checkStringCaseInsensitive("false", buf))
-        return getBoolType();
+        return DataTypeFactory::instance().get("Bool");
     /// Null or NaN
     if (checkCharCaseInsensitive('n', buf))
     {
         if (checkStringCaseInsensitive("ull", buf))
-            return getNullType();
+            return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
         else if (checkStringCaseInsensitive("an", buf))
-            return getFloat64Type();
+            return std::make_shared<DataTypeFloat64>();
     }
     /// Number
@@ -1385,7 +1328,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
     if (!remain_nothing_types && isNothing(data_type) && settings.json.infer_incomplete_types_as_strings)
     {
-        data_type = getStringType();
+        data_type = std::make_shared<DataTypeString>();
         return;
     }
@@ -1402,7 +1345,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
     /// If all objects were empty, use type String, so these JSON objects will be read as Strings.
     if (json_paths->empty() && settings.json.infer_incomplete_types_as_strings)
     {
-        data_type = getStringType();
+        data_type = std::make_shared<DataTypeString>();
         return;
     }
@@ -1424,7 +1367,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
     auto key_type = map_type->getKeyType();
     /// If all inferred Maps are empty, use type String, so these JSON objects will be read as Strings.
     if (isNothing(key_type) && settings.json.infer_incomplete_types_as_strings)
-        key_type = getStringType();
+        key_type = std::make_shared<DataTypeString>();
     auto value_type = map_type->getValueType();
@@ -1501,10 +1444,10 @@ DataTypePtr tryInferJSONNumberFromString(std::string_view field, const FormatSet
 DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)
 {
     if (settings.try_infer_dates && tryInferDate(field))
-        return getDateType();
+        return std::make_shared<DataTypeDate>();
     if (settings.try_infer_datetimes && tryInferDateTime(field, settings))
-        return getDateTime64Type();
+        return std::make_shared<DataTypeDateTime64>(9);
     return nullptr;
 }

View File

@@ -165,6 +165,7 @@ private:
     std::vector<String> sorted_dynamic_and_typed_paths;
     const auto & typed_path_columns = column_object.getTypedPaths();
     const auto & dynamic_path_columns = column_object.getDynamicPaths();
+    sorted_dynamic_and_typed_paths.reserve(typed_path_columns.size() + dynamic_path_columns.size());
     for (const auto & [path, _] : typed_path_columns)
         sorted_dynamic_and_typed_paths.push_back(path);
     for (const auto & [path, _] : dynamic_path_columns)
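Note: the added reserve() is a small allocation fix; the final size is exactly the sum of the two map sizes, so the vector can be sized once up front. A generic sketch with hypothetical maps:

```cpp
#include <string>
#include <unordered_map>
#include <vector>

// Minimal illustration: reserving the known final size avoids the repeated
// geometric reallocations push_back would otherwise trigger.
std::vector<std::string> collectPaths(
    const std::unordered_map<std::string, int> & typed,
    const std::unordered_map<std::string, int> & dynamic)
{
    std::vector<std::string> paths;
    paths.reserve(typed.size() + dynamic.size());   // exact final size known up front
    for (const auto & entry : typed)
        paths.push_back(entry.first);
    for (const auto & entry : dynamic)
        paths.push_back(entry.first);
    return paths;
}
```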

View File

@@ -68,6 +68,7 @@ WITH map(
         'Map', 'JSON',
         'Tuple', 'JSON',
         'Object', 'JSON',
+        'JSON', 'JSON',
         'String', '{}',
         'FixedString', '{}') AS native_to_mysql_mapping,
 )",

View File

@@ -35,27 +35,27 @@ ASTPtr ASTObjectTypeArgument::clone() const
     return res;
 }
-void ASTObjectTypeArgument::formatImpl(const FormatSettings & parameters, FormatState & state, FormatStateStacked frame) const
+void ASTObjectTypeArgument::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
 {
     if (path_with_type)
     {
-        path_with_type->formatImpl(parameters, state, frame);
+        path_with_type->formatImpl(settings, state, frame);
     }
     else if (parameter)
     {
-        parameter->formatImpl(parameters, state, frame);
+        parameter->formatImpl(settings, state, frame);
     }
     else if (skip_path)
     {
-        std::string indent_str = parameters.one_line ? "" : std::string(4 * frame.indent, ' ');
-        parameters.ostr << indent_str << "SKIP" << ' ';
-        skip_path->formatImpl(parameters, state, frame);
+        std::string indent_str = settings.one_line ? "" : std::string(4 * frame.indent, ' ');
+        settings.ostr << indent_str << "SKIP" << ' ';
+        skip_path->formatImpl(settings, state, frame);
     }
     else if (skip_path_regexp)
     {
-        std::string indent_str = parameters.one_line ? "" : std::string(4 * frame.indent, ' ');
-        parameters.ostr << indent_str << "SKIP REGEXP" << ' ';
-        skip_path_regexp->formatImpl(parameters, state, frame);
+        std::string indent_str = settings.one_line ? "" : std::string(4 * frame.indent, ' ');
+        settings.ostr << indent_str << "SKIP REGEXP" << ' ';
+        skip_path_regexp->formatImpl(settings, state, frame);
     }
 }

View File

@@ -213,6 +213,9 @@ void MergeTreeReaderWide::addStreams(
     ISerialization::StreamCallback callback = [&] (const ISerialization::SubstreamPath & substream_path)
     {
+        if (ISerialization::isFictitiousSubcolumn(substream_path, substream_path.size()))
+            return;
         auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(name_and_type, substream_path, data_part_info_for_read->getChecksums());
         /** If data file is missing then we will not try to open it.
@@ -348,6 +351,9 @@ void MergeTreeReaderWide::prefetchForColumn(
     deserializePrefix(serialization, name_and_type, current_task_last_mark, cache, deserialize_states_cache);
     auto callback = [&](const ISerialization::SubstreamPath & substream_path)
     {
+        if (ISerialization::isFictitiousSubcolumn(substream_path, substream_path.size()))
+            return;
         auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(name_and_type, substream_path, data_part_info_for_read->getChecksums());
         if (stream_name && !prefetched_streams.contains(*stream_name))
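Note: both reader callbacks (and checkDataPart below) now bail out early for fictitious subcolumns before resolving a stream name. Assuming, as the name suggests, that a fictitious subcolumn exists only in the serialization tree and has no physical .bin stream behind it, the guard keeps the reader from probing files that can never exist. A generic sketch of the pattern with hypothetical types:

```cpp
#include <functional>
#include <string>
#include <vector>

// Hypothetical model of a substream: some entries are purely structural.
struct Substream
{
    std::string name;
    bool fictitious = false;   // no file on disk backs this entry
};

// Enumerate substreams, but only hand physical ones to the file opener.
void forEachPhysicalStream(
    const std::vector<Substream> & substreams,
    const std::function<void(const std::string &)> & open_file)
{
    for (const auto & s : substreams)
    {
        if (s.fictitious)
            continue;   // mirror of the early `return` in the callbacks above
        open_file(s.name + ".bin");
    }
}
```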

View File

@@ -211,6 +211,9 @@ static IMergeTreeDataPart::Checksums checkDataPart(
     {
         get_serialization(column)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
         {
+            if (ISerialization::isFictitiousSubcolumn(substream_path, substream_path.size()))
+                return;
             auto stream_name = IMergeTreeDataPart::getStreamNameForColumn(column, substream_path, ".bin", data_part_storage);
             if (!stream_name)

View File

@@ -1,9 +1,8 @@
 ('a.b','Int64')
 ('a.c','Array(JSON(max_dynamic_types=16, max_dynamic_paths=256))')
-('a.c','Array(Nullable(String))')
+('d','Int64')
 ('e','Array(Nullable(Int64))')
 ('f','Int64')
-('d','Int64')
 {"o":{"a":{"b":"1","c":[{"d":"10","e":["31"]},{"d":"20","e":["63","127"]}]}}}
 {"o":{"a":{"b":"2","c":[]}}}
 {"o":{"a":{"b":"3","c":[{"e":["32"],"f":"20"},{"e":["64","128"],"f":"30"}]}}}

View File

@@ -8,8 +8,8 @@ CREATE TABLE t_json_10 (o JSON) ENGINE = Memory;
 INSERT INTO t_json_10 FORMAT JSONAsObject {"a": {"b": 1, "c": [{"d": 10, "e": [31]}, {"d": 20, "e": [63, 127]}]}} {"a": {"b": 2, "c": []}}
 INSERT INTO t_json_10 FORMAT JSONAsObject {"a": {"b": 3, "c": [{"f": 20, "e": [32]}, {"f": 30, "e": [64, 128]}]}} {"a": {"b": 4, "c": []}}
-SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(o)) FROM t_json_10;
-SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(o.a.c.:`Array(JSON)`))) FROM t_json_10;
+SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(o)) as path FROM t_json_10 order by path;
+SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(o.a.c.:`Array(JSON)`))) as path FROM t_json_10 order by path;
 SELECT o FROM t_json_10 ORDER BY o.a.b FORMAT JSONEachRow;
 SELECT o.a.b, o.a.c.:`Array(JSON)`.d, o.a.c.:`Array(JSON)`.e, o.a.c.:`Array(JSON)`.f FROM t_json_10 ORDER BY o.a.b;

View File

@@ -53,10 +53,10 @@ cat <<EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_11 FORMAT JSONAsObject"
 }
 EOF
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(obj)) FROM t_json_11;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(obj.key_1[]))) FROM t_json_11;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(obj.key_1[].key_3[])))) FROM t_json_11;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(arrayJoin(obj.key_1[].key_3[].key_4[]))))) FROM t_json_11;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(obj)) as path FROM t_json_11 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(obj.key_1[]))) as path FROM t_json_11 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(obj.key_1[].key_3[])))) as path FROM t_json_11 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(arrayJoin(obj.key_1[].key_3[].key_4[]))))) as path FROM t_json_11 order by path;"
 $CLICKHOUSE_CLIENT -q "SELECT obj FROM t_json_11 ORDER BY obj.id FORMAT JSONEachRow"
 $CLICKHOUSE_CLIENT -q "SELECT obj.key_1[].key_3 FROM t_json_11 ORDER BY obj.id FORMAT JSONEachRow"
 $CLICKHOUSE_CLIENT -q "SELECT obj.key_1[].key_3[].key_4[].key_5, obj.key_1[].key_3[].key_7 FROM t_json_11 ORDER BY obj.id"

View File

@@ -43,10 +43,10 @@ cat <<EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_12 FORMAT JSONAsObject"
 }
 EOF
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(obj)) FROM t_json_12;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(obj.key_0[]))) FROM t_json_12;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(obj.key_0[].key_1[])))) FROM t_json_12;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(arrayJoin(obj.key_0[].key_1[].key_3[]))))) FROM t_json_12;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(obj)) as path FROM t_json_12 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(obj.key_0[]))) as path FROM t_json_12 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(obj.key_0[].key_1[])))) as path FROM t_json_12 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(arrayJoin(obj.key_0[].key_1[].key_3[]))))) as path FROM t_json_12 order by path;"
 $CLICKHOUSE_CLIENT -q "SELECT obj FROM t_json_12 ORDER BY obj.id FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1
 $CLICKHOUSE_CLIENT -q "SELECT obj.key_0[].key_1[].key_3[].key_4, obj.key_0[].key_1[].key_3[].key_5, \
     obj.key_0[].key_1[].key_3[].key_6, obj.key_0[].key_1[].key_3[].key_7 FROM t_json_12 ORDER BY obj.id"

View File

@@ -36,8 +36,8 @@ cat <<EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_13 FORMAT JSONAsObject"
 }
 EOF
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(obj)) FROM t_json_13;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(obj.key1[]))) FROM t_json_13;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(obj)) as path FROM t_json_13 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(obj.key1[]))) as path FROM t_json_13 order by path;"
 $CLICKHOUSE_CLIENT -q "SELECT obj FROM t_json_13 ORDER BY obj.id FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1
 $CLICKHOUSE_CLIENT -q "SELECT \

View File

@@ -51,9 +51,9 @@ cat <<EOF | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_6 FORMAT JSONAsObject"
 }
 EOF
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(data)) FROM t_json_6;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(data.out[]))) FROM t_json_6;"
-$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(data.out[].outputs[])))) FROM t_json_6;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(data)) as path FROM t_json_6 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(data.out[]))) as path FROM t_json_6 order by path;"
+$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(data.out[].outputs[])))) as path FROM t_json_6 order by path;"
 $CLICKHOUSE_CLIENT -q "SELECT data.key, data.out[].type, data.out[].value, data.out[].outputs[].index, data.out[].outputs[].n FROM t_json_6 ORDER BY data.key"
 $CLICKHOUSE_CLIENT -q "DROP TABLE t_json_6;"

View File

@@ -16,9 +16,9 @@ ${CLICKHOUSE_CLIENT} -q "CREATE TABLE btc (data JSON) ENGINE = MergeTree ORDER B
 ${CLICKHOUSE_CLIENT} -q "INSERT INTO btc SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/btc_transactions.json', 'JSONAsObject')"
 ${CLICKHOUSE_CLIENT} -q "SELECT count() FROM btc WHERE NOT ignore(*)"
-${CLICKHOUSE_CLIENT} -q "SELECT distinct arrayJoin(JSONAllPathsWithTypes(data)) from btc"
-${CLICKHOUSE_CLIENT} -q "SELECT distinct arrayJoin(JSONAllPathsWithTypes(arrayJoin(data.inputs.:\`Array(JSON)\`))) from btc"
-${CLICKHOUSE_CLIENT} -q "SELECT distinct arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(data.inputs.:\`Array(JSON)\`.prev_out.spending_outpoints.:\`Array(JSON)\`)))) from btc"
+${CLICKHOUSE_CLIENT} -q "SELECT distinct arrayJoin(JSONAllPathsWithTypes(data)) as path from btc order by path"
+${CLICKHOUSE_CLIENT} -q "SELECT distinct arrayJoin(JSONAllPathsWithTypes(arrayJoin(data.inputs.:\`Array(JSON)\`))) as path from btc order by path"
+${CLICKHOUSE_CLIENT} -q "SELECT distinct arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(data.inputs.:\`Array(JSON)\`.prev_out.spending_outpoints.:\`Array(JSON)\`)))) as path from btc order by path"
 ${CLICKHOUSE_CLIENT} -q "SELECT avg(data.fee.:Int64), median(data.fee.:Int64) FROM btc"

View File

@@ -0,0 +1,12 @@
+5000
+leonardomso/33-js-concepts 3
+ytdl-org/youtube-dl 3
+Bogdanp/neko 2
+bminossi/AllVideoPocsFromHackerOne 2
+disclose/diodata 2
+Commit 182
+chipeo345 119
+phanwi346 114
+Nicholas Piggin 95
+direwolf-github 49
+2
View File

@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata"
+${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata (data JSON) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'" --allow_experimental_json_type 1
+cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata FORMAT JSONAsObject"
+${CLICKHOUSE_CLIENT} -q "SELECT count() FROM ghdata WHERE NOT ignore(*)"
+${CLICKHOUSE_CLIENT} -q \
+    "SELECT data.repo.name, count() AS stars FROM ghdata \
+        WHERE data.type = 'WatchEvent' GROUP BY data.repo.name ORDER BY stars DESC, data.repo.name LIMIT 5"
+${CLICKHOUSE_CLIENT} -q \
+    "SELECT data.payload.commits[].author.name AS name, count() AS c FROM ghdata \
+        ARRAY JOIN data.payload.commits[].author.name \
+        GROUP BY name ORDER BY c DESC, name LIMIT 5"
+${CLICKHOUSE_CLIENT} -q "SELECT max(data.payload.pull_request.assignees[].size0) FROM ghdata"
+${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata"

View File

@@ -19,8 +19,8 @@ cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO
 ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata_2_from_string SELECT data FROM ghdata_2_string"
 ${CLICKHOUSE_CLIENT} -q "SELECT \
-    (SELECT groupUniqArrayMap(JSONAllPathsWithTypes(data)), sum(cityHash64(toString(data))) FROM ghdata_2_from_string) = \
-    (SELECT groupUniqArrayMap(JSONAllPathsWithTypes(data)), sum(cityHash64(toString(data))) FROM ghdata_2)"
+    (SELECT mapSort(groupUniqArrayMap(JSONAllPathsWithTypes(data))), sum(cityHash64(toString(data))) FROM ghdata_2_from_string) = \
+    (SELECT mapSort(groupUniqArrayMap(JSONAllPathsWithTypes(data))), sum(cityHash64(toString(data))) FROM ghdata_2)"
 ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata_2"
 ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata_2_string"

View File

@@ -12,12 +12,12 @@
 ('results.drb','Int64')
 ('results.fg','Int64')
 ('results.fg3','Int64')
-('results.fg3_pct','DateTime64(9)')
+('results.fg3_pct','String')
 ('results.fg3a','Int64')
-('results.fg_pct','DateTime64(9)')
+('results.fg_pct','String')
 ('results.fga','Int64')
 ('results.ft','Int64')
-('results.ft_pct','DateTime64(9)')
+('results.ft_pct','String')
 ('results.fta','Int64')
 ('results.mp','Int64')
 ('results.orb','Int64')
@@ -28,7 +28,6 @@
 ('results.trb','Int64')
 ('score','Int64')
 ('won','Int64')
-('results.fg3_pct','String')
 Boston Celtics 70
 Los Angeles Lakers 64
 Milwaukee Bucks 61
@@ -41,10 +40,10 @@ Atlanta Hawks 55
 ('fg3','Int64')
 ('fg3_pct','String')
 ('fg3a','Int64')
-('fg_pct','DateTime64(9)')
+('fg_pct','String')
 ('fga','Int64')
 ('ft','Int64')
-('ft_pct','DateTime64(9)')
+('ft_pct','String')
 ('fta','Int64')
 ('mp','String')
 ('orb','Int64')
@@ -54,9 +53,6 @@ Atlanta Hawks 55
 ('stl','Int64')
 ('tov','Int64')
 ('trb','Int64')
-('fg3_pct','DateTime64(9)')
-('fg_pct','String')
-('ft_pct','String')
 Larry Bird 10
 Clyde Drexler 4
 Alvin Robertson 3

View File

@@ -14,15 +14,15 @@ ${CLICKHOUSE_CLIENT} -q "CREATE TABLE nbagames (data JSON) ENGINE = MergeTree OR
 cat $CUR_DIR/data_json/nbagames_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nbagames FORMAT JSONAsObject"
 ${CLICKHOUSE_CLIENT} -q "SELECT count() FROM nbagames WHERE NOT ignore(*)"
-${CLICKHOUSE_CLIENT} -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(data)) from nbagames"
-${CLICKHOUSE_CLIENT} -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(data.teams[]))) from nbagames"
+${CLICKHOUSE_CLIENT} -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(data)) as path from nbagames order by path"
+${CLICKHOUSE_CLIENT} -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(data.teams[]))) as path from nbagames order by path"
 ${CLICKHOUSE_CLIENT} -q \
     "SELECT teams.name.:String AS name, sum(teams.won.:Int64) AS wins FROM nbagames \
         ARRAY JOIN data.teams[] AS teams GROUP BY name \
         ORDER BY wins DESC LIMIT 5;"
-${CLICKHOUSE_CLIENT} -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(data.teams[].players[])))) from nbagames"
+${CLICKHOUSE_CLIENT} -q "SELECT DISTINCT arrayJoin(JSONAllPathsWithTypes(arrayJoin(arrayJoin(data.teams[].players[])))) as path from nbagames order by path"
 ${CLICKHOUSE_CLIENT} -q \
     "SELECT player, sum(triple_double) AS triple_doubles FROM \

View File

@@ -1,9 +0,0 @@
-Tuple(\n a Tuple(\n b Int8,\n c Nested(d Int8, e Array(Int16), f Int8)))
-{"o":{"a":{"b":1,"c":[{"d":10,"e":[31],"f":0},{"d":20,"e":[63,127],"f":0}]}}}
-{"o":{"a":{"b":2,"c":[]}}}
-{"o":{"a":{"b":3,"c":[{"d":0,"e":[32],"f":20},{"d":0,"e":[64,128],"f":30}]}}}
-{"o":{"a":{"b":4,"c":[]}}}
-1 [10,20] [[31],[63,127]] [0,0]
-2 [] [] []
-3 [0,0] [[32],[64,128]] [20,30]
-4 [] [] []

View File

@@ -4,7 +4,7 @@ SET allow_experimental_object_type = 1;
 SET output_format_json_named_tuples_as_objects = 1;
 DROP TABLE IF EXISTS t_json_10;
-CREATE TABLE t_json_10 (o JSON) ENGINE = Memory;
+CREATE TABLE t_json_10 (o Object('json')) ENGINE = Memory;
 INSERT INTO t_json_10 FORMAT JSONAsObject {"a": {"b": 1, "c": [{"d": 10, "e": [31]}, {"d": 20, "e": [63, 127]}]}} {"a": {"b": 2, "c": []}}
 INSERT INTO t_json_10 FORMAT JSONAsObject {"a": {"b": 3, "c": [{"f": 20, "e": [32]}, {"f": 30, "e": [64, 128]}]}} {"a": {"b": 4, "c": []}}

View File

@@ -6,7 +6,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 . "$CUR_DIR"/../shell_config.sh
 ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata"
-${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata (data JSON) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'" --allow_experimental_object_type 1
+${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata (data Object('json')) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'" --allow_experimental_object_type 1
 cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata FORMAT JSONAsObject"

View File

@@ -39,7 +39,7 @@ ${CLICKHOUSE_CLIENT} -q \
         GROUP BY player ORDER BY triple_doubles DESC, player LIMIT 5"
 ${CLICKHOUSE_CLIENT} -q "CREATE TABLE nbagames_string (data String) ENGINE = MergeTree ORDER BY tuple()"
-${CLICKHOUSE_CLIENT} -q "CREATE TABLE nbagames_from_string (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1
+${CLICKHOUSE_CLIENT} -q "CREATE TABLE nbagames_from_string (data Object('json')) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1
 cat $CUR_DIR/data_json/nbagames_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nbagames_string FORMAT JSONAsString"
 ${CLICKHOUSE_CLIENT} -q "INSERT INTO nbagames_from_string SELECT data FROM nbagames_string"

View File

@@ -6,7 +6,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 . "$CURDIR"/../shell_config.sh
 $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json_async_insert"
-$CLICKHOUSE_CLIENT --allow_experimental_object_type=1 -q "CREATE TABLE t_json_async_insert (data Object(''json'')) ENGINE = MergeTree ORDER BY tuple()"
+$CLICKHOUSE_CLIENT --allow_experimental_object_type=1 -q "CREATE TABLE t_json_async_insert (data Object('json')) ENGINE = MergeTree ORDER BY tuple()"
 $CLICKHOUSE_CLIENT --async_insert=1 --wait_for_async_insert=1 -q 'INSERT INTO t_json_async_insert FORMAT JSONAsObject {"aaa"}' 2>&1 | grep -o -m1 "Cannot parse object"
 $CLICKHOUSE_CLIENT -q "SELECT count() FROM t_json_async_insert"

View File

@@ -21,7 +21,7 @@ echo '
     }
 }' > 02482_object_data.jsonl
-$CLICKHOUSE_LOCAL --allow_experimental_object_type=1 -q "select * from file(02482_object_data.jsonl, auto, 'obj Object('json')')"
+$CLICKHOUSE_LOCAL --allow_experimental_object_type=1 -q "select * from file(02482_object_data.jsonl, auto, 'obj Object(''json'')')"
 rm 02482_object_data.jsonl

View File

@@ -44,7 +44,7 @@ nested.col1 Array(String) NO \N
 nested.col2 Array(UInt32) NO \N
 nfs Nullable(FixedString(3)) YES \N
 ns Nullable(String) YES \N
-o Object(\'json\') NO \N
+o JSON NO \N
 p Point NO \N
 pg Polygon NO \N
 r Ring NO \N