Make ParquetMetadata report whether a bloom filter is present

This commit is contained in:
Michael Kolupaev 2024-10-23 01:27:10 +00:00
parent 9e2ae7e0c7
commit e3ebe51968
3 changed files with 69 additions and 7 deletions

View File

@ -92,8 +92,9 @@ static NamesAndTypesList getHeaderForParquetMetadata()
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>()),
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>())},
Names{"num_values", "null_count", "distinct_count", "min", "max"}),
DataTypeFactory::instance().get("Bool"),
},
Names{"name", "path", "total_compressed_size", "total_uncompressed_size", "have_statistics", "statistics"}))},
Names{"name", "path", "total_compressed_size", "total_uncompressed_size", "have_statistics", "statistics", "have_bloom_filter"}))},
Names{"num_columns", "num_rows", "total_uncompressed_size", "total_compressed_size", "columns"}))},
};
return names_and_types;
@ -350,6 +351,8 @@ void ParquetMetadataInputFormat::fillColumnChunksMetadata(const std::unique_ptr<
fillColumnStatistics(column_chunk_metadata->statistics(), tuple_column.getColumn(5), row_group_metadata->schema()->Column(column_i)->type_length());
else
tuple_column.getColumn(5).insertDefault();
bool have_bloom_filter = column_chunk_metadata->bloom_filter_offset().has_value();
assert_cast<ColumnUInt8 &>(tuple_column.getColumn(6)).insertValue(have_bloom_filter);
}
array_column.getOffsets().push_back(tuple_column.size());
}

View File

@ -78,7 +78,8 @@
"distinct_count": null,
"min": "0",
"max": "999"
}
},
"have_bloom_filter": false
},
{
"name": "str",
@ -92,7 +93,8 @@
"distinct_count": null,
"min": "Hello0",
"max": "Hello999"
}
},
"have_bloom_filter": false
},
{
"name": "mod",
@ -106,7 +108,8 @@
"distinct_count": null,
"min": "0",
"max": "8"
}
},
"have_bloom_filter": false
}
]
},
@ -128,7 +131,8 @@
"distinct_count": null,
"min": "0",
"max": "999"
}
},
"have_bloom_filter": false
},
{
"name": "str",
@ -142,7 +146,8 @@
"distinct_count": null,
"min": "Hello0",
"max": "Hello999"
}
},
"have_bloom_filter": false
},
{
"name": "mod",
@ -156,7 +161,8 @@
"distinct_count": null,
"min": "0",
"max": "8"
}
},
"have_bloom_filter": false
}
]
}
@ -223,3 +229,55 @@
}
1
1
{
"num_columns": "1",
"num_rows": "5",
"num_row_groups": "1",
"format_version": "1.0",
"metadata_size": "267",
"total_uncompressed_size": "105",
"total_compressed_size": "128",
"columns": [
{
"name": "ipv6",
"path": "ipv6",
"max_definition_level": "0",
"max_repetition_level": "0",
"physical_type": "FIXED_LEN_BYTE_ARRAY",
"logical_type": "None",
"compression": "GZIP",
"total_uncompressed_size": "105",
"total_compressed_size": "128",
"space_saved": "-21.9%",
"encodings": [
"PLAIN",
"BIT_PACKED"
]
}
],
"row_groups": [
{
"num_columns": "1",
"num_rows": "5",
"total_uncompressed_size": "105",
"total_compressed_size": "128",
"columns": [
{
"name": "ipv6",
"path": "ipv6",
"total_compressed_size": "128",
"total_uncompressed_size": "105",
"have_statistics": true,
"statistics": {
"num_values": "5",
"null_count": "0",
"distinct_count": null,
"min": "27 32 150 125 17 250 66 31 157 44 75 218 51 50 19 144 ",
"max": "154 31 90 141 15 7 68 47 190 29 121 145 188 162 234 154 "
},
"have_bloom_filter": true
}
]
}
]
}

View File

@ -17,3 +17,4 @@ $CLICKHOUSE_LOCAL -q "select some_column from file('$CURDIR/data_parquet/02718_d
$CLICKHOUSE_LOCAL -q "select num_columns from file('$CURDIR/data_parquet/02718_data.parquet', ParquetMetadata, 'num_columns Array(UInt32)')" 2>&1 | grep -c "BAD_ARGUMENTS"
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_parquet/ipv6_bloom_filter.gz.parquet', ParquetMetadata) format JSONEachRow" | python3 -m json.tool