2020-02-14 19:48:45 +00:00
|
|
|
#pragma once
|
|
|
|
|
2020-02-21 15:21:31 +00:00
|
|
|
#include <re2/re2.h>
|
2020-03-27 20:10:03 +00:00
|
|
|
#include <re2/stringpiece.h>
|
2020-02-21 15:21:31 +00:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2020-02-14 19:48:45 +00:00
|
|
|
#include <Core/Block.h>
|
|
|
|
#include <Processors/Formats/IRowInputFormat.h>
|
|
|
|
#include <Formats/FormatSettings.h>
|
|
|
|
#include <Formats/FormatFactory.h>
|
2020-02-21 15:21:31 +00:00
|
|
|
#include <IO/PeekableReadBuffer.h>
|
2020-03-27 20:10:03 +00:00
|
|
|
#include <Formats/ParsedTemplateFormatString.h>
|
2020-02-14 19:48:45 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Forward declaration: in this header ReadBuffer is only named as a reference parameter of a constructor.
class ReadBuffer;
|
|
|
|
|
2020-03-27 20:10:03 +00:00
|
|
|
/// Regexp input format.
|
|
|
|
/// This format applies regular expression from format_regexp setting for every line of file
|
|
|
|
/// (the lines must be separated by newline character ('\n') or DOS-style newline ("\r\n")).
|
|
|
|
/// Every matched subpattern will be parsed with the method of corresponding data type
|
|
|
|
/// (according to format_regexp_escaping_rule setting). If the regexp did not match the line,
|
|
|
|
/// if format_regexp_skip_unmatched is 1, the line is silently skipped, if the setting is 0, exception will be thrown.
|
2020-02-14 19:48:45 +00:00
|
|
|
|
|
|
|
class RegexpRowInputFormat : public IRowInputFormat
|
|
|
|
{
|
2020-03-27 20:10:03 +00:00
|
|
|
using ColumnFormat = ParsedTemplateFormatString::ColumnFormat;
|
2020-02-14 19:48:45 +00:00
|
|
|
public:
|
2020-02-18 13:53:12 +00:00
|
|
|
RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_);
|
2020-02-14 19:48:45 +00:00
|
|
|
|
|
|
|
String getName() const override { return "RegexpRowInputFormat"; }
|
|
|
|
|
|
|
|
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
|
|
|
|
|
|
|
|
private:
|
|
|
|
bool readField(size_t index, MutableColumns & columns);
|
|
|
|
void readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext);
|
2020-03-28 01:17:49 +00:00
|
|
|
static ColumnFormat stringToFormat(const String & format);
|
2020-02-14 19:48:45 +00:00
|
|
|
|
2020-02-21 15:21:31 +00:00
|
|
|
PeekableReadBuffer buf;
|
2020-02-14 19:48:45 +00:00
|
|
|
const FormatSettings format_settings;
|
2020-03-27 20:10:03 +00:00
|
|
|
ColumnFormat field_format;
|
2020-02-21 15:21:31 +00:00
|
|
|
|
|
|
|
RE2 regexp;
|
|
|
|
// The vector of fields extracted from line using regexp.
|
2020-03-27 20:10:03 +00:00
|
|
|
std::vector<re2::StringPiece> matched_fields;
|
2020-02-21 15:21:31 +00:00
|
|
|
// These two vectors are needed to use RE2::FullMatchN (function for extracting fields).
|
|
|
|
std::vector<RE2::Arg> re2_arguments;
|
|
|
|
std::vector<RE2::Arg *> re2_arguments_ptrs;
|
2020-02-14 19:48:45 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|