Add TabSeparatedRawRowInputFormat

This commit is contained in:
hcz 2020-06-28 16:41:56 +08:00
parent 1b1c32fe89
commit 0a8a29272b
3 changed files with 76 additions and 3 deletions

View File

@ -0,0 +1,58 @@
#pragma once
#include <Core/Block.h>
#include <Formats/FormatSettings.h>
#include <IO/ReadBufferFromString.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h>
namespace DB
{
/** A stream to input data in tsv format, but without escaping individual values.
* It only supports one string column
*/
class TabSeparatedRawRowInputFormat : public TabSeparatedRowInputFormat
{
public:
/** with_names - the first line is the header with the names of the columns
* with_types - on the next line header with type names
*/
TabSeparatedRawRowInputFormat(
const Block & header_,
ReadBuffer & in_,
const Params & params_,
bool with_names_,
bool with_types_,
const FormatSettings & format_settings_)
: TabSeparatedRowInputFormat(header_, in_, params_, with_names_, with_types_, format_settings_)
{
}
String getName() const override { return "TabSeparatedRawRowInputFormat"; }
bool readField(IColumn & column, const DataTypePtr & type, bool) override
{
// TODO: possible to optimize
std::string buf;
while (!in.eof())
{
char c = *in.position();
if (c == '\n' || c == '\t')
break;
in.ignore();
buf.push_back(c);
}
ReadBufferFromString line_in(buf);
type->deserializeAsWholeText(column, line_in, format_settings);
return true;
}
};
}

View File

@ -3,6 +3,7 @@
#include <IO/Operators.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
#include <Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h>
#include <Formats/verbosePrintString.h>
#include <Formats/FormatFactory.h>
#include <DataTypes/DataTypeNothing.h>
@ -360,6 +361,18 @@ void registerInputFormatProcessorTabSeparated(FormatFactory & factory)
});
}
/// Raw TSV input: both spellings of the format name are registered with an identical creator.
for (const auto * raw_format_name : {"TabSeparatedRaw", "TSVRaw"})
{
    /// The two `false` arguments are with_names_ and with_types_ of the
    /// TabSeparatedRawRowInputFormat constructor: no header lines are expected.
    auto create_raw_format = [](
        ReadBuffer & read_buffer,
        const Block & header,
        IRowInputFormat::Params row_params,
        const FormatSettings & format_settings)
    {
        return std::make_shared<TabSeparatedRawRowInputFormat>(
            header, read_buffer, row_params, false, false, format_settings);
    };

    factory.registerInputFormatProcessor(raw_format_name, create_raw_format);
}
for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"})
{
factory.registerInputFormatProcessor(name, [](

View File

@ -28,10 +28,14 @@ public:
void resetParser() override;
private:
protected:
bool with_names;
bool with_types;
const FormatSettings format_settings;
virtual bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column);
private:
DataTypes data_types;
using IndexesMap = std::unordered_map<String, size_t>;
@ -43,8 +47,6 @@ private:
std::vector<UInt8> read_columns;
std::vector<size_t> columns_to_fill_with_default_values;
bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column);
void addInputColumn(const String & column_name);
void setupAllColumnsByTableSchema();
void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension);