Merge pull request #44446 from Avogar/arrow-nullables

Respect the schema_inference_make_columns_nullable setting in Parquet/ORC/Arrow formats
This commit is contained in:
Kruglov Pavel 2023-01-02 16:05:57 +01:00 committed by GitHub
commit 1c2dc05d6e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 54 additions and 6 deletions

View File

@ -173,8 +173,9 @@ NamesAndTypesList ArrowSchemaReader::readSchema()
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
*schema, stream ? "ArrowStream" : "Arrow", format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference);
return getNamesAndRecursivelyNullableTypes(header);
}
if (format_settings.schema_inference_make_columns_nullable)
return getNamesAndRecursivelyNullableTypes(header);
return header.getNamesAndTypesList();}
void registerInputFormatArrow(FormatFactory & factory)
{
@ -208,12 +209,24 @@ void registerArrowSchemaReader(FormatFactory & factory)
{
return std::make_shared<ArrowSchemaReader>(buf, false, settings);
});
factory.registerAdditionalInfoForSchemaCacheGetter("Arrow", [](const FormatSettings & settings)
{
return fmt::format("schema_inference_make_columns_nullable={}", settings.schema_inference_make_columns_nullable);
});
factory.registerSchemaReader(
"ArrowStream",
[](ReadBuffer & buf, const FormatSettings & settings)
{
return std::make_shared<ArrowSchemaReader>(buf, true, settings);
});}
});
factory.registerAdditionalInfoForSchemaCacheGetter("ArrowStream", [](const FormatSettings & settings)
{
return fmt::format("schema_inference_make_columns_nullable={}", settings.schema_inference_make_columns_nullable);
});
}
}
#else

View File

@ -189,8 +189,9 @@ NamesAndTypesList ORCSchemaReader::readSchema()
getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped);
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
*schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
return getNamesAndRecursivelyNullableTypes(header);
}
if (format_settings.schema_inference_make_columns_nullable)
return getNamesAndRecursivelyNullableTypes(header);
return header.getNamesAndTypesList();}
void registerInputFormatORC(FormatFactory & factory)
{
@ -216,6 +217,11 @@ void registerORCSchemaReader(FormatFactory & factory)
return std::make_shared<ORCSchemaReader>(buf, settings);
}
);
factory.registerAdditionalInfoForSchemaCacheGetter("ORC", [](const FormatSettings & settings)
{
return fmt::format("schema_inference_make_columns_nullable={}", settings.schema_inference_make_columns_nullable);
});
}
}

View File

@ -187,7 +187,9 @@ NamesAndTypesList ParquetSchemaReader::readSchema()
getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped);
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
*schema, "Parquet", format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference);
return getNamesAndRecursivelyNullableTypes(header);
if (format_settings.schema_inference_make_columns_nullable)
return getNamesAndRecursivelyNullableTypes(header);
return header.getNamesAndTypesList();
}
void registerInputFormatParquet(FormatFactory & factory)
@ -214,6 +216,11 @@ void registerParquetSchemaReader(FormatFactory & factory)
return std::make_shared<ParquetSchemaReader>(buf, settings);
}
);
factory.registerAdditionalInfoForSchemaCacheGetter("Parquet", [](const FormatSettings & settings)
{
return fmt::format("schema_inference_make_columns_nullable={}", settings.schema_inference_make_columns_nullable);
});
}
}

View File

@ -0,0 +1,6 @@
number Nullable(UInt64)
number UInt64
number Nullable(Int64)
number Int64
number Nullable(UInt64)
number UInt64

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Tags: no-fasttest

# For each of the Parquet, ORC and Arrow formats, writes three rows of
# numbers(3) in that format, pipes them into a second clickhouse-local that
# reads the same format, and prints the description of the resulting table,
# once with schema_inference_make_columns_nullable=1 and once with =0.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Prints `desc test` for data produced in the given format.
#   $1 - format name, used both for the writer and for the reader
#   $2 - value passed to --schema_inference_make_columns_nullable
function describe_inferred_schema()
{
    local data_format=$1
    local make_nullable=$2

    $CLICKHOUSE_LOCAL -q "select * from numbers(3) format ${data_format}" \
        | $CLICKHOUSE_LOCAL --input-format="${data_format}" --table=test -q "desc test" --schema_inference_make_columns_nullable="${make_nullable}"
}

# The order (format outer, 1 before 0 inner) matches the line order of the reference output.
for data_format in Parquet ORC Arrow
do
    for make_nullable in 1 0
    do
        describe_inferred_schema "$data_format" "$make_nullable"
    done
done