Merge pull request #70635 from CurtizJ/fix-parsing-to-sparse-column

Fix parsing of some formats into sparse columns
This commit is contained in:
Anton Popov 2024-10-16 07:13:29 +00:00 committed by GitHub
commit 56b9f19ec3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 143 additions and 1 deletions

View File

@ -274,7 +274,8 @@ size_t IRowInputFormat::countRows(size_t)
/// Rebuilds `serializations` from the port header using the given `hints`.
///
/// Hints are applied only when the concrete format reports
/// supportsCustomSerializations(); for every other format `serializations`
/// is left untouched. The assignment must not happen outside this guard,
/// otherwise the check has no effect and hints reach formats that did not opt in.
void IRowInputFormat::setSerializationHints(const SerializationInfoByName & hints)
{
    if (supportsCustomSerializations())
        serializations = getPort().getHeader().getSerializations(hints);
}

View File

@ -59,6 +59,7 @@ protected:
/// `max_block_size` can be ignored.
virtual size_t countRows(size_t max_block_size);
virtual bool supportsCountRows() const { return false; }
/// Whether serialization hints may be applied to this format: setSerializationHints()
/// rebuilds `serializations` from the hints only when this returns true.
/// Returns false unless a derived format overrides it.
virtual bool supportsCustomSerializations() const { return false; }
virtual void readPrefix() {} /// delimiter before begin of result
virtual void readSuffix() {} /// delimiter after end of result

View File

@ -43,6 +43,7 @@ private:
size_t countRows(size_t max_block_size) override;
bool supportsCountRows() const override { return true; }
/// Reports that this format supports custom serializations (always true).
/// NOTE(review): presumably consulted by the base class when serialization hints are set — confirm.
bool supportsCustomSerializations() const override { return true; }
const String & columnName(size_t i) const;
size_t columnIndex(StringRef name, size_t key_index);

View File

@ -38,6 +38,7 @@ private:
bool supportsCountRows() const override { return true; }
size_t countRows(size_t max_block_size) override;
/// Reports that this format supports custom serializations (always true).
/// NOTE(review): presumably consulted by the base class when serialization hints are set — confirm.
bool supportsCustomSerializations() const override { return true; }
const FormatSettings format_settings;

View File

@ -48,6 +48,7 @@ protected:
bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override;
void setReadBuffer(ReadBuffer & in_) override;
void readPrefix() override;
/// Reports that this format supports custom serializations (always true).
/// NOTE(review): presumably consulted by the base class when serialization hints are set — confirm.
bool supportsCustomSerializations() const override { return true; }
const FormatSettings format_settings;
DataTypes data_types;

View File

@ -0,0 +1,102 @@
Arrow
9260153077572524277
ArrowStream
9260153077572524277
Avro
9260153077572524277
BSONEachRow
9260153077572524277
CSV
9260153077572524277
CSVWithNames
9260153077572524277
CSVWithNamesAndTypes
9260153077572524277
CapnProto
9260153077572524277
CustomSeparated
9260153077572524277
CustomSeparatedWithNames
9260153077572524277
CustomSeparatedWithNamesAndTypes
9260153077572524277
JSON
9260153077572524277
JSONColumns
9260153077572524277
JSONColumnsWithMetadata
9260153077572524277
JSONCompact
9260153077572524277
JSONCompactColumns
9260153077572524277
JSONCompactEachRow
9260153077572524277
JSONCompactEachRowWithNames
9260153077572524277
JSONCompactEachRowWithNamesAndTypes
9260153077572524277
JSONCompactStringsEachRow
9260153077572524277
JSONCompactStringsEachRowWithNames
9260153077572524277
JSONCompactStringsEachRowWithNamesAndTypes
9260153077572524277
JSONEachRow
9260153077572524277
JSONLines
9260153077572524277
JSONObjectEachRow
9260153077572524277
JSONStringsEachRow
9260153077572524277
MsgPack
9260153077572524277
NDJSON
9260153077572524277
Native
9260153077572524277
ORC
9260153077572524277
Parquet
9260153077572524277
Raw
9260153077572524277
RawWithNames
9260153077572524277
RawWithNamesAndTypes
9260153077572524277
RowBinary
9260153077572524277
RowBinaryWithNames
9260153077572524277
RowBinaryWithNamesAndTypes
9260153077572524277
TSKV
9260153077572524277
TSV
9260153077572524277
TSVRaw
9260153077572524277
TSVRawWithNames
9260153077572524277
TSVRawWithNamesAndTypes
9260153077572524277
TSVWithNames
9260153077572524277
TSVWithNamesAndTypes
9260153077572524277
TabSeparated
9260153077572524277
TabSeparatedRaw
9260153077572524277
TabSeparatedRawWithNames
9260153077572524277
TabSeparatedRawWithNamesAndTypes
9260153077572524277
TabSeparatedWithNames
9260153077572524277
TabSeparatedWithNamesAndTypes
9260153077572524277
Values
9260153077572524277

View File

@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Tags: no-fasttest, long

# For each format that is both an input and an output format, this script:
#   1. inserts 1000 rows through the client, filling only column `a`;
#   2. inserts 1000 rows over HTTP in that format with all three columns
#      (`b` is 0 and `c` is '' in every row);
#   3. inserts 1000 rows over HTTP in that format with only column `a`;
#   4. prints a checksum over the whole table and truncates it.
# Both HTTP inserts pass enable_parsing_to_custom_serialization=1.

set -e

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Sends stdin as the data of "INSERT INTO <target> FORMAT <format>" via the HTTP interface.
#   $1 - insert target, optionally with a column list, e.g. "tbl(a)"
#   $2 - format name
function insert_over_http()
{
    local target="$1"
    local fmt="$2"

    ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+${target}+FORMAT+${fmt}&enable_parsing_to_custom_serialization=1" --data-binary @-
}

# Names of the formats under test, one per line; a fixed set of formats is excluded by name.
io_formats=$($CLICKHOUSE_CLIENT --query "
SELECT name FROM system.formats
WHERE is_input AND is_output AND name NOT IN ('Template', 'Npy', 'RawBLOB', 'ProtobufList', 'ProtobufSingle', 'Protobuf', 'LineAsString')
ORDER BY name FORMAT TSV
")

$CLICKHOUSE_CLIENT --query "
DROP TABLE IF EXISTS t_sparse_all_formats;
CREATE TABLE t_sparse_all_formats (a UInt64, b UInt64, c String) ENGINE = MergeTree ORDER BY a;
"

# Deliberately unquoted: the list is split on whitespace into one format name per iteration.
for fmt in $io_formats
do
    echo "$fmt"

    $CLICKHOUSE_CLIENT --query "INSERT INTO t_sparse_all_formats(a) SELECT number FROM numbers(1000)"

    $CLICKHOUSE_CLIENT --query "SELECT number AS a, 0::UInt64 AS b, '' AS c FROM numbers(1000) FORMAT $fmt" \
        | insert_over_http "t_sparse_all_formats" "$fmt"

    $CLICKHOUSE_CLIENT --query "SELECT number AS a FROM numbers(1000) FORMAT $fmt" \
        | insert_over_http "t_sparse_all_formats(a)" "$fmt"

    # The checksum is printed per format; the table is emptied for the next iteration.
    $CLICKHOUSE_CLIENT --query "
SELECT sum(sipHash64(*)) FROM t_sparse_all_formats;
TRUNCATE TABLE t_sparse_all_formats;
"
done