Merge pull request #34306 from ClickHouse/line-as-string-low-performance

Fix terribly low performance of `LineAsString` format
This commit is contained in:
Maksim Kita 2022-02-04 13:11:27 +01:00 committed by GitHub
commit 074b827cf3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 30 additions and 29 deletions

View File

@ -216,6 +216,15 @@ void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf)
readStringUntilCharsInto<' '>(s, buf);
}
/// Reads bytes from `buf` into `s`, using '\n' as the only stop character.
/// Thin wrapper over readStringUntilCharsInto.
/// NOTE(review): the delimiter itself appears to be left unread in `buf`
/// (the LineAsString caller skips it explicitly afterwards) - confirm against
/// readStringUntilCharsInto.
template <typename Vector>
void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf)
{
    constexpr char line_feed = '\n';
    readStringUntilCharsInto<line_feed>(s, buf);
}
/// Explicit instantiations of readStringUntilNewlineInto for the two destination
/// types provided here: a PaddedPODArray<UInt8> byte array and a String.
template void readStringUntilNewlineInto<PaddedPODArray<UInt8>>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
template void readStringUntilNewlineInto<String>(String & s, ReadBuffer & buf);
template <typename Vector>
void readNullTerminated(Vector & s, ReadBuffer & buf)
{

View File

@ -604,6 +604,9 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf)
template <typename Vector>
void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf);
/// Reads from `buf` into `s` until a '\n' is encountered.
/// NOTE(review): presumably the '\n' itself is not consumed - confirm against the definition.
template <typename Vector>
void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf);
/// This could be used as template parameter for functions above, if you want to just skip data.
struct NullOutput
{
@ -1387,4 +1390,3 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf);
void readJSONFieldIntoString(String & s, ReadBuffer & buf);
}

View File

@ -2,6 +2,8 @@
#include <Formats/JSONEachRowUtils.h>
#include <base/find_symbols.h>
#include <IO/ReadHelpers.h>
#include <Columns/ColumnString.h>
namespace DB
{
@ -14,7 +16,8 @@ namespace ErrorCodes
LineAsStringRowInputFormat::LineAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) :
IRowInputFormat(header_, in_, std::move(params_))
{
if (header_.columns() > 1 || header_.getDataTypes()[0]->getTypeId() != TypeIndex::String)
if (header_.columns() != 1
|| !typeid_cast<const ColumnString *>(header_.getByPosition(0).column.get()))
{
throw Exception("This input format is only suitable for tables with a single column of type String.", ErrorCodes::INCORRECT_QUERY);
}
@ -27,28 +30,16 @@ void LineAsStringRowInputFormat::resetParser()
void LineAsStringRowInputFormat::readLineObject(IColumn & column)
{
DB::Memory<> object;
ColumnString & column_string = assert_cast<ColumnString &>(column);
auto & chars = column_string.getChars();
auto & offsets = column_string.getOffsets();
char * pos = in->position();
bool need_more_data = true;
readStringUntilNewlineInto(chars, *in);
chars.push_back(0);
offsets.push_back(chars.size());
while (loadAtPosition(*in, object, pos) && need_more_data)
{
pos = find_first_symbols<'\n'>(pos, in->buffer().end());
if (pos == in->buffer().end())
continue;
if (*pos == '\n')
need_more_data = false;
++pos;
}
saveUpToPosition(*in, object, pos);
loadAtPosition(*in, object, pos);
/// Last character is always \n.
column.insertData(object.data(), object.size() - 1);
if (!in->eof())
in->ignore(); /// Skip '\n'
}
bool LineAsStringRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
@ -57,17 +48,16 @@ bool LineAsStringRowInputFormat::readRow(MutableColumns & columns, RowReadExtens
return false;
readLineObject(*columns[0]);
return true;
}
void registerInputFormatLineAsString(FormatFactory & factory)
{
factory.registerInputFormat("LineAsString", [](
ReadBuffer & buf,
const Block & sample,
const RowInputFormatParams & params,
const FormatSettings &)
ReadBuffer & buf,
const Block & sample,
const RowInputFormatParams & params,
const FormatSettings &)
{
return std::make_shared<LineAsStringRowInputFormat>(sample, buf, params);
});
@ -76,9 +66,10 @@ void registerInputFormatLineAsString(FormatFactory & factory)
void registerLineAsStringSchemaReader(FormatFactory & factory)
{
factory.registerExternalSchemaReader("LineAsString", [](
const FormatSettings &)
const FormatSettings &)
{
return std::make_shared<LinaAsStringSchemaReader>();
});
}
}

View File

@ -61,4 +61,3 @@ void registerRawBLOBSchemaReader(FormatFactory & factory)
}
}