mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Improve performance and memory usage for select of subset of columns for some formats
This commit is contained in:
parent
e7296a2b28
commit
b17fec659a
@ -631,7 +631,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
||||
M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \
|
||||
M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \
|
||||
M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". Braces around array can be omitted.)", 0) \
|
||||
M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \
|
||||
M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \
|
||||
M(Bool, input_format_with_names_use_header, true, "For -WithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \
|
||||
M(Bool, input_format_with_types_use_header, true, "For -WithNamesAndTypes input formats this controls whether format parser should check if data types from the input match data types from the header.", 0) \
|
||||
M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \
|
||||
|
@ -71,7 +71,7 @@ String escapingRuleToString(FormatSettings::EscapingRule escaping_rule)
|
||||
|
||||
void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
|
||||
{
|
||||
String tmp;
|
||||
NullOutput out;
|
||||
constexpr const char * field_name = "<SKIPPED COLUMN>";
|
||||
constexpr size_t field_name_len = 16;
|
||||
switch (escaping_rule)
|
||||
@ -80,19 +80,19 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca
|
||||
/// Empty field, just skip spaces
|
||||
break;
|
||||
case FormatSettings::EscapingRule::Escaped:
|
||||
readEscapedString(tmp, buf);
|
||||
readEscapedStringInto(out, buf);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::Quoted:
|
||||
readQuotedFieldIntoString(tmp, buf);
|
||||
readQuotedFieldInto(out, buf);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::CSV:
|
||||
readCSVString(tmp, buf, format_settings.csv);
|
||||
readCSVStringInto(out, buf, format_settings.csv);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::JSON:
|
||||
skipJSONField(buf, StringRef(field_name, field_name_len));
|
||||
break;
|
||||
case FormatSettings::EscapingRule::Raw:
|
||||
readString(tmp, buf);
|
||||
readStringInto(out, buf);
|
||||
break;
|
||||
default:
|
||||
__builtin_unreachable();
|
||||
@ -219,7 +219,7 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin
|
||||
if constexpr (read_string)
|
||||
readQuotedString(result, buf);
|
||||
else
|
||||
readQuotedFieldIntoString(result, buf);
|
||||
readQuotedField(result, buf);
|
||||
break;
|
||||
case FormatSettings::EscapingRule::JSON:
|
||||
if constexpr (read_string)
|
||||
|
@ -538,19 +538,19 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na
|
||||
}
|
||||
|
||||
|
||||
void FormatFactory::markFormatAsColumnOriented(const String & name)
|
||||
void FormatFactory::markFormatSupportsSamplingColumns(const String & name)
|
||||
{
|
||||
auto & target = dict[name].is_column_oriented;
|
||||
auto & target = dict[name].supports_sampling_columns;
|
||||
if (target)
|
||||
throw Exception("FormatFactory: Format " + name + " is already marked as column oriented", ErrorCodes::LOGICAL_ERROR);
|
||||
target = true;
|
||||
}
|
||||
|
||||
|
||||
bool FormatFactory::checkIfFormatIsColumnOriented(const String & name)
|
||||
bool FormatFactory::checkIfFormatSupportsSamplingColumns(const String & name)
|
||||
{
|
||||
const auto & target = getCreators(name);
|
||||
return target.is_column_oriented;
|
||||
return target.supports_sampling_columns;
|
||||
}
|
||||
|
||||
bool FormatFactory::isInputFormat(const String & name) const
|
||||
|
@ -108,7 +108,7 @@ private:
|
||||
SchemaReaderCreator schema_reader_creator;
|
||||
ExternalSchemaReaderCreator external_schema_reader_creator;
|
||||
bool supports_parallel_formatting{false};
|
||||
bool is_column_oriented{false};
|
||||
bool supports_sampling_columns{false};
|
||||
NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
|
||||
AppendSupportChecker append_support_checker;
|
||||
};
|
||||
@ -194,9 +194,9 @@ public:
|
||||
void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator);
|
||||
|
||||
void markOutputFormatSupportsParallelFormatting(const String & name);
|
||||
void markFormatAsColumnOriented(const String & name);
|
||||
void markFormatSupportsSamplingColumns(const String & name);
|
||||
|
||||
bool checkIfFormatIsColumnOriented(const String & name);
|
||||
bool checkIfFormatSupportsSamplingColumns(const String & name);
|
||||
|
||||
bool checkIfFormatHasSchemaReader(const String & name);
|
||||
bool checkIfFormatHasExternalSchemaReader(const String & name);
|
||||
|
@ -10,4 +10,10 @@ void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWit
|
||||
register_func(base_format_name + "WithNamesAndTypes", true, true);
|
||||
}
|
||||
|
||||
void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory)
|
||||
{
|
||||
factory.markFormatSupportsSamplingColumns(base_format_name + "WithNames");
|
||||
factory.markFormatSupportsSamplingColumns(base_format_name + "WithNamesAndTypes");
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <Formats/FormatFactory.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -9,4 +10,6 @@ namespace DB
|
||||
using RegisterWithNamesAndTypesFunc = std::function<void(const std::string & format_name, bool with_names, bool with_types)>;
|
||||
void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func);
|
||||
|
||||
void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory);
|
||||
|
||||
}
|
||||
|
@ -700,16 +700,18 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV &
|
||||
if (!buf.hasPendingData())
|
||||
continue;
|
||||
|
||||
/** CSV format can contain insignificant spaces and tabs.
|
||||
if constexpr (!std::is_same_v<Vector, NullOutput>)
|
||||
{
|
||||
/** CSV format can contain insignificant spaces and tabs.
|
||||
* Usually the task of skipping them is for the calling code.
|
||||
* But in this case, it will be difficult to do this, so we remove the trailing whitespace ourselves.
|
||||
*/
|
||||
size_t size = s.size();
|
||||
while (size > 0
|
||||
&& (s[size - 1] == ' ' || s[size - 1] == '\t'))
|
||||
--size;
|
||||
size_t size = s.size();
|
||||
while (size > 0 && (s[size - 1] == ' ' || s[size - 1] == '\t'))
|
||||
--size;
|
||||
|
||||
s.resize(size);
|
||||
s.resize(size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -741,6 +743,7 @@ void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & sett
|
||||
}
|
||||
|
||||
template void readCSVStringInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
|
||||
template void readCSVStringInto<NullOutput>(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings);
|
||||
|
||||
|
||||
template <typename Vector, typename ReturnType>
|
||||
@ -1313,8 +1316,8 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim
|
||||
}
|
||||
|
||||
// Use PeekableReadBuffer to copy field to string after parsing.
|
||||
template <typename ParseFunc>
|
||||
static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func)
|
||||
template <typename Vector, typename ParseFunc>
|
||||
static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func)
|
||||
{
|
||||
PeekableReadBuffer peekable_buf(buf);
|
||||
peekable_buf.setCheckpoint();
|
||||
@ -1326,8 +1329,8 @@ static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc pa
|
||||
peekable_buf.position() = end;
|
||||
}
|
||||
|
||||
template <char opening_bracket, char closing_bracket>
|
||||
static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
|
||||
template <char opening_bracket, char closing_bracket, typename Vector>
|
||||
static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf)
|
||||
{
|
||||
assertChar(opening_bracket, buf);
|
||||
s.push_back(opening_bracket);
|
||||
@ -1363,10 +1366,9 @@ static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf)
|
||||
}
|
||||
}
|
||||
|
||||
void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
|
||||
template <typename Vector>
|
||||
void readQuotedFieldInto(Vector & s, ReadBuffer & buf)
|
||||
{
|
||||
s.clear();
|
||||
|
||||
if (buf.eof())
|
||||
return;
|
||||
|
||||
@ -1386,11 +1388,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
|
||||
s.push_back('\'');
|
||||
}
|
||||
else if (*buf.position() == '[')
|
||||
readQuotedFieldInBrackets<'[', ']'>(s, buf);
|
||||
readQuotedFieldInBracketsInto<'[', ']'>(s, buf);
|
||||
else if (*buf.position() == '(')
|
||||
readQuotedFieldInBrackets<'(', ')'>(s, buf);
|
||||
readQuotedFieldInBracketsInto<'(', ')'>(s, buf);
|
||||
else if (*buf.position() == '{')
|
||||
readQuotedFieldInBrackets<'{', '}'>(s, buf);
|
||||
readQuotedFieldInBracketsInto<'{', '}'>(s, buf);
|
||||
else if (checkCharCaseInsensitive('n', buf))
|
||||
{
|
||||
/// NULL or NaN
|
||||
@ -1423,14 +1425,20 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf)
|
||||
Float64 tmp;
|
||||
readFloatText(tmp, in);
|
||||
};
|
||||
readParsedValueIntoString(s, buf, parse_func);
|
||||
readParsedValueInto(s, buf, parse_func);
|
||||
}
|
||||
}
|
||||
|
||||
void readQuotedField(String & s, ReadBuffer & buf)
|
||||
{
|
||||
s.clear();
|
||||
readQuotedFieldInto(s, buf);
|
||||
}
|
||||
|
||||
void readJSONFieldIntoString(String & s, ReadBuffer & buf)
|
||||
{
|
||||
auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); };
|
||||
readParsedValueIntoString(s, buf, parse_func);
|
||||
readParsedValueInto(s, buf, parse_func);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1425,7 +1425,10 @@ struct PcgDeserializer
|
||||
}
|
||||
};
|
||||
|
||||
void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
|
||||
template <typename Vector>
|
||||
void readQuotedFieldInto(Vector & s, ReadBuffer & buf);
|
||||
|
||||
void readQuotedField(String & s, ReadBuffer & buf);
|
||||
|
||||
void readJSONFieldIntoString(String & s, ReadBuffer & buf);
|
||||
|
||||
|
@ -188,7 +188,7 @@ void registerInputFormatArrow(FormatFactory & factory)
|
||||
{
|
||||
return std::make_shared<ArrowBlockInputFormat>(buf, sample, false, format_settings);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("Arrow");
|
||||
factory.markFormatSupportsSamplingColumns("Arrow");
|
||||
factory.registerInputFormat(
|
||||
"ArrowStream",
|
||||
[](ReadBuffer & buf,
|
||||
|
@ -114,6 +114,7 @@ void registerInputFormatRowBinary(FormatFactory & factory)
|
||||
};
|
||||
|
||||
registerWithNamesAndTypes("RowBinary", register_func);
|
||||
factory.markFormatSupportsSamplingColumns("RowBinaryWithNamesAndTypes");
|
||||
factory.registerFileExtension("bin", "RowBinary");
|
||||
}
|
||||
|
||||
|
@ -112,7 +112,9 @@ String CSVFormatReader::readCSVFieldIntoString()
|
||||
|
||||
void CSVFormatReader::skipField()
|
||||
{
|
||||
readCSVFieldIntoString<true>();
|
||||
skipWhitespacesAndTabs(*in);
|
||||
NullOutput out;
|
||||
readCSVStringInto(out, *in, format_settings.csv);
|
||||
}
|
||||
|
||||
void CSVFormatReader::skipRowEndDelimiter()
|
||||
@ -374,6 +376,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory)
|
||||
};
|
||||
|
||||
registerWithNamesAndTypes("CSV", register_func);
|
||||
markFormatWithNamesAndTypesSupportsSamplingColumns("CSV", factory);
|
||||
}
|
||||
|
||||
void registerCSVSchemaReader(FormatFactory & factory)
|
||||
|
@ -310,6 +310,7 @@ void registerInputFormatCapnProto(FormatFactory & factory)
|
||||
return std::make_shared<CapnProtoRowInputFormat>(buf, sample, std::move(params),
|
||||
FormatSchemaInfo(settings, "CapnProto", true), settings);
|
||||
});
|
||||
factory.markFormatSupportsSamplingColumns("CapnProto");
|
||||
factory.registerFileExtension("capnp", "CapnProto");
|
||||
}
|
||||
|
||||
|
@ -333,6 +333,7 @@ void registerInputFormatCustomSeparated(FormatFactory & factory)
|
||||
});
|
||||
};
|
||||
registerWithNamesAndTypes(ignore_spaces ? "CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func);
|
||||
markFormatWithNamesAndTypesSupportsSamplingColumns(ignore_spaces ? "CustomSeparatedIgnoreSpaces" : "CustomSeparated", factory);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -229,6 +229,7 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory)
|
||||
};
|
||||
|
||||
registerWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func);
|
||||
markFormatWithNamesAndTypesSupportsSamplingColumns(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", factory);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -393,6 +393,11 @@ void registerInputFormatJSONEachRow(FormatFactory & factory)
|
||||
{
|
||||
return std::make_shared<JSONEachRowRowInputFormat>(buf, sample, std::move(params), settings, true);
|
||||
});
|
||||
|
||||
factory.markFormatSupportsSamplingColumns("JSONEachRow");
|
||||
factory.markFormatSupportsSamplingColumns("JSONLines");
|
||||
factory.markFormatSupportsSamplingColumns("NDJSON");
|
||||
factory.markFormatSupportsSamplingColumns("JSONStringsEachRow");
|
||||
}
|
||||
|
||||
void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory)
|
||||
|
@ -397,8 +397,8 @@ bool MySQLDumpRowInputFormat::readField(IColumn & column, size_t column_idx)
|
||||
|
||||
void MySQLDumpRowInputFormat::skipField()
|
||||
{
|
||||
String tmp;
|
||||
readQuotedFieldIntoString(tmp, *in);
|
||||
NullOutput out;
|
||||
readQuotedFieldInto(out, *in);
|
||||
}
|
||||
|
||||
MySQLDumpSchemaReader::MySQLDumpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
|
||||
@ -434,7 +434,7 @@ DataTypes MySQLDumpSchemaReader::readRowAndGetDataTypes()
|
||||
if (!data_types.empty())
|
||||
skipFieldDelimiter(in);
|
||||
|
||||
readQuotedFieldIntoString(value, in);
|
||||
readQuotedField(value, in);
|
||||
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
|
||||
data_types.push_back(std::move(type));
|
||||
}
|
||||
|
@ -200,7 +200,7 @@ void registerInputFormatORC(FormatFactory & factory)
|
||||
{
|
||||
return std::make_shared<ORCBlockInputFormat>(buf, sample, settings);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("ORC");
|
||||
factory.markFormatSupportsSamplingColumns("ORC");
|
||||
}
|
||||
|
||||
void registerORCSchemaReader(FormatFactory & factory)
|
||||
|
@ -193,7 +193,7 @@ void registerInputFormatParquet(FormatFactory & factory)
|
||||
{
|
||||
return std::make_shared<ParquetBlockInputFormat>(buf, sample, settings);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("Parquet");
|
||||
factory.markFormatSupportsSamplingColumns("Parquet");
|
||||
}
|
||||
|
||||
void registerParquetSchemaReader(FormatFactory & factory)
|
||||
|
@ -79,7 +79,7 @@ void registerInputFormatProtobufList(FormatFactory & factory)
|
||||
return std::make_shared<ProtobufListInputFormat>(buf, sample, std::move(params),
|
||||
FormatSchemaInfo(settings, "Protobuf", true), settings.protobuf.input_flatten_google_wrappers);
|
||||
});
|
||||
factory.markFormatAsColumnOriented("ProtobufList");
|
||||
factory.markFormatSupportsSamplingColumns("ProtobufList");
|
||||
}
|
||||
|
||||
void registerProtobufListSchemaReader(FormatFactory & factory)
|
||||
|
@ -69,6 +69,7 @@ void registerInputFormatProtobuf(FormatFactory & factory)
|
||||
with_length_delimiter,
|
||||
settings.protobuf.input_flatten_google_wrappers);
|
||||
});
|
||||
factory.markFormatSupportsSamplingColumns(with_length_delimiter ? "Protobuf" : "ProtobufSingle");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -280,6 +280,8 @@ void registerInputFormatTSKV(FormatFactory & factory)
|
||||
{
|
||||
return std::make_shared<TSKVRowInputFormat>(buf, sample, std::move(params), settings);
|
||||
});
|
||||
|
||||
factory.markFormatSupportsSamplingColumns("TSKV");
|
||||
}
|
||||
void registerTSKVSchemaReader(FormatFactory & factory)
|
||||
{
|
||||
|
@ -80,7 +80,11 @@ String TabSeparatedFormatReader::readFieldIntoString()
|
||||
|
||||
void TabSeparatedFormatReader::skipField()
|
||||
{
|
||||
readFieldIntoString();
|
||||
NullOutput out;
|
||||
if (is_raw)
|
||||
readStringInto(out, *in);
|
||||
else
|
||||
readEscapedStringInto(out, *in);
|
||||
}
|
||||
|
||||
void TabSeparatedFormatReader::skipHeaderRow()
|
||||
@ -347,6 +351,8 @@ void registerFileSegmentationEngineTabSeparated(FormatFactory & factory)
|
||||
|
||||
registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func);
|
||||
registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func);
|
||||
markFormatWithNamesAndTypesSupportsSamplingColumns(is_raw ? "TSVRaw" : "TSV", factory);
|
||||
markFormatWithNamesAndTypesSupportsSamplingColumns(is_raw ? "TabSeparatedRaw" : "TabSeparated", factory);
|
||||
}
|
||||
|
||||
// We can use the same segmentation engine for TSKV.
|
||||
|
@ -599,7 +599,7 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes()
|
||||
skipWhitespaceIfAny(buf);
|
||||
}
|
||||
|
||||
readQuotedFieldIntoString(value, buf);
|
||||
readQuotedField(value, buf);
|
||||
auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted);
|
||||
data_types.push_back(std::move(type));
|
||||
}
|
||||
|
@ -476,9 +476,9 @@ private:
|
||||
};
|
||||
|
||||
|
||||
bool StorageHDFS::isColumnOriented() const
|
||||
bool StorageHDFS::supportsSamplingColumns() const
|
||||
{
|
||||
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name);
|
||||
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSamplingColumns(format_name);
|
||||
}
|
||||
|
||||
Pipe StorageHDFS::read(
|
||||
@ -527,7 +527,7 @@ Pipe StorageHDFS::read(
|
||||
|
||||
ColumnsDescription columns_description;
|
||||
Block block_for_format;
|
||||
if (isColumnOriented())
|
||||
if (supportsSamplingColumns())
|
||||
{
|
||||
auto fetch_columns = column_names;
|
||||
const auto & virtuals = getVirtuals();
|
||||
|
@ -57,7 +57,7 @@ public:
|
||||
/// It is useful because column oriented formats could effectively skip unknown columns
|
||||
/// So we can create a header of only required columns in read method and ask
|
||||
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
|
||||
bool isColumnOriented() const override;
|
||||
bool supportsSamplingColumns() const override;
|
||||
|
||||
static ColumnsDescription getTableStructureFromData(
|
||||
const String & format,
|
||||
|
@ -622,14 +622,14 @@ HiveFilePtr StorageHive::getHiveFileIfNeeded(
|
||||
return hive_file;
|
||||
}
|
||||
|
||||
bool StorageHive::isColumnOriented() const
|
||||
bool StorageHive::supportsSamplingColumns() const
|
||||
{
|
||||
return format_name == "Parquet" || format_name == "ORC";
|
||||
}
|
||||
|
||||
void StorageHive::getActualColumnsToRead(Block & sample_block, const Block & header_block, const NameSet & partition_columns) const
|
||||
{
|
||||
if (!isColumnOriented())
|
||||
if (!supportsSamplingColumns())
|
||||
sample_block = header_block;
|
||||
UInt32 erased_columns = 0;
|
||||
for (const auto & column : partition_columns)
|
||||
@ -795,7 +795,7 @@ std::optional<UInt64>
|
||||
StorageHive::totalRowsImpl(const Settings & settings, const SelectQueryInfo & query_info, ContextPtr context_, PruneLevel prune_level) const
|
||||
{
|
||||
/// Row-based format like Text doesn't support totalRowsByPartitionPredicate
|
||||
if (!isColumnOriented())
|
||||
if (!supportsSamplingColumns())
|
||||
return {};
|
||||
|
||||
auto hive_metastore_client = HiveMetastoreClientFactory::instance().getOrCreate(hive_metastore_url);
|
||||
|
@ -63,7 +63,7 @@ public:
|
||||
|
||||
NamesAndTypesList getVirtuals() const override;
|
||||
|
||||
bool isColumnOriented() const override;
|
||||
bool supportsSamplingColumns() const override;
|
||||
|
||||
std::optional<UInt64> totalRows(const Settings & settings) const override;
|
||||
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr context_) const override;
|
||||
|
@ -585,7 +585,7 @@ public:
|
||||
/// Returns true if all disks of storage are read-only.
|
||||
virtual bool isStaticStorage() const;
|
||||
|
||||
virtual bool isColumnOriented() const { return false; }
|
||||
virtual bool supportsSamplingColumns() const { return false; }
|
||||
|
||||
/// If it is possible to quickly determine exact number of rows in the table at this moment of time, then return it.
|
||||
/// Used for:
|
||||
|
@ -316,9 +316,9 @@ ColumnsDescription StorageFile::getTableStructureFromFile(
|
||||
return readSchemaFromFormat(format, format_settings, read_buffer_iterator, paths.size() > 1, context);
|
||||
}
|
||||
|
||||
bool StorageFile::isColumnOriented() const
|
||||
bool StorageFile::supportsSamplingColumns() const
|
||||
{
|
||||
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name);
|
||||
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSamplingColumns(format_name);
|
||||
}
|
||||
|
||||
StorageFile::StorageFile(int table_fd_, CommonArguments args)
|
||||
@ -465,7 +465,7 @@ public:
|
||||
const ColumnsDescription & columns_description,
|
||||
const FilesInfoPtr & files_info)
|
||||
{
|
||||
if (storage->isColumnOriented())
|
||||
if (storage->supportsSamplingColumns())
|
||||
return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
|
||||
else
|
||||
return getHeader(storage_snapshot->metadata, files_info->need_path_column, files_info->need_file_column);
|
||||
@ -530,7 +530,7 @@ public:
|
||||
|
||||
auto get_block_for_format = [&]() -> Block
|
||||
{
|
||||
if (storage->isColumnOriented())
|
||||
if (storage->supportsSamplingColumns())
|
||||
return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
|
||||
return storage_snapshot->metadata->getSampleBlock();
|
||||
};
|
||||
@ -690,7 +690,7 @@ Pipe StorageFile::read(
|
||||
{
|
||||
const auto get_columns_for_format = [&]() -> ColumnsDescription
|
||||
{
|
||||
if (isColumnOriented())
|
||||
if (supportsSamplingColumns())
|
||||
return ColumnsDescription{
|
||||
storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()};
|
||||
else
|
||||
|
@ -69,11 +69,11 @@ public:
|
||||
|
||||
static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read);
|
||||
|
||||
/// Check if the format is column-oriented.
|
||||
/// It is useful because column oriented formats could effectively skip unknown columns
|
||||
/// Check if the format supports reading only some sampling of columns.
|
||||
/// It is useful because such formats could effectively skip unknown columns
|
||||
/// So we can create a header of only required columns in read method and ask
|
||||
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
|
||||
bool isColumnOriented() const override;
|
||||
bool supportsSamplingColumns() const override;
|
||||
|
||||
bool supportsPartitionBy() const override { return true; }
|
||||
|
||||
|
@ -676,9 +676,9 @@ std::shared_ptr<StorageS3Source::IteratorWrapper> StorageS3::createFileIterator(
|
||||
}
|
||||
}
|
||||
|
||||
bool StorageS3::isColumnOriented() const
|
||||
bool StorageS3::supportsSamplingColumns() const
|
||||
{
|
||||
return FormatFactory::instance().checkIfFormatIsColumnOriented(format_name);
|
||||
return FormatFactory::instance().checkIfFormatSupportsSamplingColumns(format_name);
|
||||
}
|
||||
|
||||
Pipe StorageS3::read(
|
||||
@ -707,7 +707,7 @@ Pipe StorageS3::read(
|
||||
|
||||
ColumnsDescription columns_description;
|
||||
Block block_for_format;
|
||||
if (isColumnOriented())
|
||||
if (supportsSamplingColumns())
|
||||
{
|
||||
auto fetch_columns = column_names;
|
||||
const auto & virtuals = getVirtuals();
|
||||
|
@ -234,7 +234,7 @@ private:
|
||||
ContextPtr ctx,
|
||||
std::vector<String> * read_keys_in_distributed_processing = nullptr);
|
||||
|
||||
bool isColumnOriented() const override;
|
||||
bool supportsSamplingColumns() const override;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -582,9 +582,9 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData(
|
||||
return readSchemaFromFormat(format, format_settings, read_buffer_iterator, urls_to_check.size() > 1, context);
|
||||
}
|
||||
|
||||
bool IStorageURLBase::isColumnOriented() const
|
||||
bool IStorageURLBase::supportsSamplingColumns() const
|
||||
{
|
||||
return FormatFactory::instance().checkIfFormatIsColumnOriented(format_name);
|
||||
return FormatFactory::instance().checkIfFormatSupportsSamplingColumns(format_name);
|
||||
}
|
||||
|
||||
Pipe IStorageURLBase::read(
|
||||
@ -600,7 +600,7 @@ Pipe IStorageURLBase::read(
|
||||
|
||||
ColumnsDescription columns_description;
|
||||
Block block_for_format;
|
||||
if (isColumnOriented())
|
||||
if (supportsSamplingColumns())
|
||||
{
|
||||
columns_description = ColumnsDescription{
|
||||
storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()};
|
||||
@ -688,7 +688,7 @@ Pipe StorageURLWithFailover::read(
|
||||
{
|
||||
ColumnsDescription columns_description;
|
||||
Block block_for_format;
|
||||
if (isColumnOriented())
|
||||
if (supportsSamplingColumns())
|
||||
{
|
||||
columns_description = ColumnsDescription{
|
||||
storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()};
|
||||
|
@ -93,7 +93,7 @@ protected:
|
||||
QueryProcessingStage::Enum & processed_stage,
|
||||
size_t max_block_size) const;
|
||||
|
||||
bool isColumnOriented() const override;
|
||||
bool supportsSamplingColumns() const override;
|
||||
|
||||
private:
|
||||
virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0;
|
||||
|
@ -140,7 +140,7 @@ SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMet
|
||||
chooseCompressionMethod(uri, compression_method));
|
||||
}
|
||||
|
||||
bool StorageXDBC::isColumnOriented() const
|
||||
bool StorageXDBC::supportsSamplingColumns() const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
@ -67,7 +67,7 @@ private:
|
||||
|
||||
Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const override;
|
||||
|
||||
bool isColumnOriented() const override;
|
||||
bool supportsSamplingColumns() const override;
|
||||
};
|
||||
|
||||
}
|
||||
|
33
tests/performance/formats_columns_sampling.xml
Normal file
33
tests/performance/formats_columns_sampling.xml
Normal file
@ -0,0 +1,33 @@
|
||||
<test>
|
||||
<settings>
|
||||
<max_threads>1</max_threads>
|
||||
</settings>
|
||||
|
||||
<substitutions>
|
||||
<substitution>
|
||||
<name>format</name>
|
||||
<values>
|
||||
<value>TabSeparatedWithNames</value>
|
||||
<value>TabSeparatedRawWithNames</value>
|
||||
<value>CustomSeparatedWithNames</value>
|
||||
<value>CSVWithNames</value>
|
||||
<value>JSONEachRow</value>
|
||||
<value>JSONCompactEachRowWithNames</value>
|
||||
<value>TSKV</value>
|
||||
<value>RowBinaryWithNamesAndTypes</value>
|
||||
<value>Avro</value>
|
||||
<value>ORC</value>
|
||||
<value>Parquet</value>
|
||||
<value>Arrow</value>
|
||||
</values>
|
||||
</substitution>
|
||||
</substitutions>
|
||||
|
||||
<create_query>CREATE TABLE IF NOT EXISTS table_{format} ENGINE = File({format}) AS test.hits</create_query>
|
||||
|
||||
<fill_query>INSERT INTO table_{format} SELECT * FROM test.hits LIMIT 100000</fill_query>
|
||||
|
||||
<query>SELECT WatchID FROM table_{format} FORMAT Null</query>
|
||||
|
||||
<drop_query>DROP TABLE IF EXISTS table_{format}</drop_query>
|
||||
</test>
|
Loading…
Reference in New Issue
Block a user