mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 19:12:03 +00:00
Merge pull request #57053 from Avogar/better-parsing-exceptions
Better exception messages in input formats
This commit is contained in:
commit
570d1c013b
@ -88,7 +88,7 @@ public:
|
|||||||
{
|
{
|
||||||
/// A more understandable error message.
|
/// A more understandable error message.
|
||||||
if (e.code() == DB::ErrorCodes::CANNOT_READ_ALL_DATA || e.code() == DB::ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
|
if (e.code() == DB::ErrorCodes::CANNOT_READ_ALL_DATA || e.code() == DB::ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
|
||||||
throw DB::ParsingException(e.code(), "File {} is empty. You must fill it manually with appropriate value.", path);
|
throw DB::Exception(e.code(), "File {} is empty. You must fill it manually with appropriate value.", path);
|
||||||
else
|
else
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
|
@ -616,48 +616,4 @@ ExecutionStatus ExecutionStatus::fromText(const std::string & data)
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
ParsingException::ParsingException() = default;
|
|
||||||
ParsingException::ParsingException(const std::string & msg, int code)
|
|
||||||
: Exception(msg, code)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
/// We use additional field formatted_message_ to make this method const.
|
|
||||||
std::string ParsingException::displayText() const
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
formatted_message = message();
|
|
||||||
bool need_newline = false;
|
|
||||||
if (!file_name.empty())
|
|
||||||
{
|
|
||||||
formatted_message += fmt::format(": (in file/uri {})", file_name);
|
|
||||||
need_newline = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (line_number != -1)
|
|
||||||
{
|
|
||||||
formatted_message += fmt::format(": (at row {})", line_number);
|
|
||||||
need_newline = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (need_newline)
|
|
||||||
formatted_message += "\n";
|
|
||||||
}
|
|
||||||
catch (...) {} // NOLINT(bugprone-empty-catch)
|
|
||||||
|
|
||||||
if (!formatted_message.empty())
|
|
||||||
{
|
|
||||||
std::string result = name();
|
|
||||||
result.append(": ");
|
|
||||||
result.append(formatted_message);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
return Exception::displayText();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -235,43 +235,6 @@ private:
|
|||||||
const char * className() const noexcept override { return "DB::ErrnoException"; }
|
const char * className() const noexcept override { return "DB::ErrnoException"; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/// Special class of exceptions, used mostly in ParallelParsingInputFormat for
|
|
||||||
/// more convenient calculation of problem line number.
|
|
||||||
class ParsingException : public Exception
|
|
||||||
{
|
|
||||||
ParsingException(const std::string & msg, int code);
|
|
||||||
public:
|
|
||||||
ParsingException();
|
|
||||||
|
|
||||||
// Format message with fmt::format, like the logging functions.
|
|
||||||
template <typename... Args>
|
|
||||||
ParsingException(int code, FormatStringHelper<Args...> fmt, Args &&... args) : Exception(fmt::format(fmt.fmt_str, std::forward<Args>(args)...), code)
|
|
||||||
{
|
|
||||||
message_format_string = fmt.message_format_string;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string displayText() const override;
|
|
||||||
|
|
||||||
ssize_t getLineNumber() const { return line_number; }
|
|
||||||
void setLineNumber(int line_number_) { line_number = line_number_;}
|
|
||||||
|
|
||||||
String getFileName() const { return file_name; }
|
|
||||||
void setFileName(const String & file_name_) { file_name = file_name_; }
|
|
||||||
|
|
||||||
Exception * clone() const override { return new ParsingException(*this); }
|
|
||||||
void rethrow() const override { throw *this; } // NOLINT
|
|
||||||
|
|
||||||
private:
|
|
||||||
ssize_t line_number{-1};
|
|
||||||
String file_name;
|
|
||||||
mutable std::string formatted_message;
|
|
||||||
|
|
||||||
const char * name() const noexcept override { return "DB::ParsingException"; }
|
|
||||||
const char * className() const noexcept override { return "DB::ParsingException"; }
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
using Exceptions = std::vector<std::exception_ptr>;
|
using Exceptions = std::vector<std::exception_ptr>;
|
||||||
|
|
||||||
/** Try to write an exception to the log (and forget about it).
|
/** Try to write an exception to the log (and forget about it).
|
||||||
|
@ -390,7 +390,7 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams(
|
|||||||
/// Check consistency between offsets and elements subcolumns.
|
/// Check consistency between offsets and elements subcolumns.
|
||||||
/// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER.
|
/// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER.
|
||||||
if (!nested_column->empty() && nested_column->size() != last_offset)
|
if (!nested_column->empty() && nested_column->size() != last_offset)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all array values: read just {} of {}",
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all array values: read just {} of {}",
|
||||||
toString(nested_column->size()), toString(last_offset));
|
toString(nested_column->size()), toString(last_offset));
|
||||||
|
|
||||||
column = std::move(mutable_column);
|
column = std::move(mutable_column);
|
||||||
@ -445,7 +445,7 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r
|
|||||||
if (*istr.position() == ',')
|
if (*istr.position() == ',')
|
||||||
++istr.position();
|
++istr.position();
|
||||||
else
|
else
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT,
|
throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT,
|
||||||
"Cannot read array from text, expected comma or end of array, found '{}'",
|
"Cannot read array from text, expected comma or end of array, found '{}'",
|
||||||
*istr.position());
|
*istr.position());
|
||||||
}
|
}
|
||||||
|
@ -359,7 +359,7 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
|
|||||||
nested_column.popBack(1);
|
nested_column.popBack(1);
|
||||||
|
|
||||||
if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos)
|
if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos)
|
||||||
throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation "
|
throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation "
|
||||||
"containing '\\t' or '\\n' may not work correctly for large input.");
|
"containing '\\t' or '\\n' may not work correctly for large input.");
|
||||||
|
|
||||||
WriteBufferFromOwnString parsed_value;
|
WriteBufferFromOwnString parsed_value;
|
||||||
@ -367,7 +367,7 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
|
|||||||
nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings);
|
nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||||
else
|
else
|
||||||
nested_serialization->serializeTextRaw(nested_column, nested_column.size() - 1, parsed_value, settings);
|
nested_serialization->serializeTextRaw(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||||
throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing \"{}{}\" as Nullable"
|
throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing \"{}{}\" as Nullable"
|
||||||
" at position {}: got \"{}\", which was deserialized as \"{}\". "
|
" at position {}: got \"{}\", which was deserialized as \"{}\". "
|
||||||
"It seems that input data is ill-formatted.",
|
"It seems that input data is ill-formatted.",
|
||||||
std::string(pos, buf.buffer().end()),
|
std::string(pos, buf.buffer().end()),
|
||||||
@ -452,7 +452,7 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re
|
|||||||
/// It can happen only if there is an unquoted string instead of a number.
|
/// It can happen only if there is an unquoted string instead of a number.
|
||||||
/// We also should delete incorrectly deserialized value from nested column.
|
/// We also should delete incorrectly deserialized value from nested column.
|
||||||
nested_column.popBack(1);
|
nested_column.popBack(1);
|
||||||
throw DB::ParsingException(
|
throw DB::Exception(
|
||||||
ErrorCodes::CANNOT_READ_ALL_DATA,
|
ErrorCodes::CANNOT_READ_ALL_DATA,
|
||||||
"Error while parsing Nullable: got an unquoted string {} instead of a number",
|
"Error while parsing Nullable: got an unquoted string {} instead of a number",
|
||||||
String(buf.position(), std::min(10ul, buf.available())));
|
String(buf.position(), std::min(10ul, buf.available())));
|
||||||
@ -589,12 +589,12 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB
|
|||||||
|
|
||||||
if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos
|
if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos
|
||||||
|| null_representation.find('\n') != std::string::npos)
|
|| null_representation.find('\n') != std::string::npos)
|
||||||
throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing "
|
throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing "
|
||||||
"format_csv_delimiter, '\\r' or '\\n' may not work correctly for large input.");
|
"format_csv_delimiter, '\\r' or '\\n' may not work correctly for large input.");
|
||||||
|
|
||||||
WriteBufferFromOwnString parsed_value;
|
WriteBufferFromOwnString parsed_value;
|
||||||
nested_serialization->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings);
|
nested_serialization->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings);
|
||||||
throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing \"{}{}\" as Nullable"
|
throw DB::Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while parsing \"{}{}\" as Nullable"
|
||||||
" at position {}: got \"{}\", which was deserialized as \"{}\". "
|
" at position {}: got \"{}\", which was deserialized as \"{}\". "
|
||||||
"It seems that input data is ill-formatted.",
|
"It seems that input data is ill-formatted.",
|
||||||
std::string(pos, buf.buffer().end()),
|
std::string(pos, buf.buffer().end()),
|
||||||
|
@ -43,7 +43,7 @@ namespace JSONUtils
|
|||||||
{
|
{
|
||||||
const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
|
const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
|
||||||
if (min_bytes != 0 && current_object_size > 10 * min_bytes)
|
if (min_bytes != 0 && current_object_size > 10 * min_bytes)
|
||||||
throw ParsingException(ErrorCodes::INCORRECT_DATA,
|
throw Exception(ErrorCodes::INCORRECT_DATA,
|
||||||
"Size of JSON object at position {} is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. "
|
"Size of JSON object at position {} is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. "
|
||||||
"Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, "
|
"Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, "
|
||||||
"most likely JSON is malformed", in.count(), min_bytes, current_object_size);
|
"most likely JSON is malformed", in.count(), min_bytes, current_object_size);
|
||||||
|
@ -120,7 +120,7 @@ Block NativeReader::read()
|
|||||||
if (istr.eof())
|
if (istr.eof())
|
||||||
{
|
{
|
||||||
if (use_index)
|
if (use_index)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Input doesn't contain all data for index.");
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Input doesn't contain all data for index.");
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
@ -89,7 +89,7 @@ void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
|
|||||||
else
|
else
|
||||||
out << " before: " << quote << String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position()));
|
out << " before: " << quote << String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position()));
|
||||||
|
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Cannot parse input: expected {}", out.str());
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "Cannot parse input: expected {}", out.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -562,7 +562,7 @@ static ReturnType readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
|
|||||||
if (buf.eof() || *buf.position() != quote)
|
if (buf.eof() || *buf.position() != quote)
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
|
throw Exception(ErrorCodes::CANNOT_PARSE_QUOTED_STRING,
|
||||||
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
|
"Cannot parse quoted string: expected opening quote '{}', got '{}'",
|
||||||
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
|
std::string{quote}, buf.eof() ? "EOF" : std::string{*buf.position()});
|
||||||
else
|
else
|
||||||
@ -608,7 +608,7 @@ static ReturnType readAnyQuotedStringInto(Vector & s, ReadBuffer & buf)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_QUOTED_STRING, "Cannot parse quoted string: expected closing quote");
|
throw Exception(ErrorCodes::CANNOT_PARSE_QUOTED_STRING, "Cannot parse quoted string: expected closing quote");
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
}
|
}
|
||||||
@ -958,7 +958,7 @@ ReturnType readJSONStringInto(Vector & s, ReadBuffer & buf)
|
|||||||
auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
|
auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(code, std::move(message));
|
throw Exception(code, std::move(message));
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1009,7 +1009,7 @@ ReturnType readJSONObjectOrArrayPossiblyInvalid(Vector & s, ReadBuffer & buf)
|
|||||||
auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
|
auto error = [](FormatStringHelper<> message [[maybe_unused]], int code [[maybe_unused]])
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(code, std::move(message));
|
throw Exception(code, std::move(message));
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1185,7 +1185,7 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime");
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1212,7 +1212,7 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
|
|||||||
s_pos[size] = 0;
|
s_pos[size] = 0;
|
||||||
|
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1235,7 +1235,7 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
|
|||||||
s_pos[size] = 0;
|
s_pos[size] = 0;
|
||||||
|
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1266,7 +1266,7 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
|
|||||||
if (too_short && negative_multiplier != -1)
|
if (too_short && negative_multiplier != -1)
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime");
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -296,7 +296,7 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case
|
|||||||
[[fallthrough]];
|
[[fallthrough]];
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value");
|
throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -340,7 +340,7 @@ ReturnType readIntTextImpl(T & x, ReadBuffer & buf)
|
|||||||
if (has_sign)
|
if (has_sign)
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER,
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER,
|
||||||
"Cannot parse number with multiple sign (+/-) characters");
|
"Cannot parse number with multiple sign (+/-) characters");
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
@ -357,7 +357,7 @@ ReturnType readIntTextImpl(T & x, ReadBuffer & buf)
|
|||||||
if (has_sign)
|
if (has_sign)
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER,
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER,
|
||||||
"Cannot parse number with multiple sign (+/-) characters");
|
"Cannot parse number with multiple sign (+/-) characters");
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
@ -368,7 +368,7 @@ ReturnType readIntTextImpl(T & x, ReadBuffer & buf)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Unsigned type must not contain '-' symbol");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Unsigned type must not contain '-' symbol");
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
}
|
}
|
||||||
@ -430,7 +430,7 @@ end:
|
|||||||
if (has_sign && !has_number)
|
if (has_sign && !has_number)
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER,
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER,
|
||||||
"Cannot parse number with a sign character but without any numeric character");
|
"Cannot parse number with a sign character but without any numeric character");
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
@ -837,7 +837,7 @@ inline ReturnType readUUIDTextImpl(UUID & uuid, ReadBuffer & buf)
|
|||||||
|
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
{
|
{
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", s);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -855,7 +855,7 @@ inline ReturnType readUUIDTextImpl(UUID & uuid, ReadBuffer & buf)
|
|||||||
|
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
{
|
{
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", s);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -881,7 +881,7 @@ inline ReturnType readIPv4TextImpl(IPv4 & ip, ReadBuffer & buf)
|
|||||||
return ReturnType(true);
|
return ReturnType(true);
|
||||||
|
|
||||||
if constexpr (std::is_same_v<ReturnType, void>)
|
if constexpr (std::is_same_v<ReturnType, void>)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_IPV4, "Cannot parse IPv4 {}", std::string_view(buf.position(), buf.available()));
|
throw Exception(ErrorCodes::CANNOT_PARSE_IPV4, "Cannot parse IPv4 {}", std::string_view(buf.position(), buf.available()));
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
}
|
}
|
||||||
@ -903,7 +903,7 @@ inline ReturnType readIPv6TextImpl(IPv6 & ip, ReadBuffer & buf)
|
|||||||
return ReturnType(true);
|
return ReturnType(true);
|
||||||
|
|
||||||
if constexpr (std::is_same_v<ReturnType, void>)
|
if constexpr (std::is_same_v<ReturnType, void>)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_IPV6, "Cannot parse IPv6 {}", std::string_view(buf.position(), buf.available()));
|
throw Exception(ErrorCodes::CANNOT_PARSE_IPV6, "Cannot parse IPv6 {}", std::string_view(buf.position(), buf.available()));
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
}
|
}
|
||||||
@ -944,7 +944,7 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
|
|||||||
if (!buf.eof() && !isNumericASCII(*buf.position()))
|
if (!buf.eof() && !isNumericASCII(*buf.position()))
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse datetime");
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse datetime");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1017,7 +1017,7 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
|
|||||||
{
|
{
|
||||||
readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut);
|
readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut);
|
||||||
}
|
}
|
||||||
catch (const DB::ParsingException &)
|
catch (const DB::Exception &)
|
||||||
{
|
{
|
||||||
if (buf.eof() || *buf.position() != '.')
|
if (buf.eof() || *buf.position() != '.')
|
||||||
throw;
|
throw;
|
||||||
@ -1125,7 +1125,7 @@ inline void readDateTimeText(LocalDateTime & datetime, ReadBuffer & buf)
|
|||||||
if (10 != size)
|
if (10 != size)
|
||||||
{
|
{
|
||||||
s[size] = 0;
|
s[size] = 0;
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
||||||
}
|
}
|
||||||
|
|
||||||
datetime.year((s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'));
|
datetime.year((s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'));
|
||||||
@ -1141,7 +1141,7 @@ inline void readDateTimeText(LocalDateTime & datetime, ReadBuffer & buf)
|
|||||||
if (8 != size)
|
if (8 != size)
|
||||||
{
|
{
|
||||||
s[size] = 0;
|
s[size] = 0;
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse time component of DateTime {}", s);
|
||||||
}
|
}
|
||||||
|
|
||||||
datetime.hour((s[0] - '0') * 10 + (s[1] - '0'));
|
datetime.hour((s[0] - '0') * 10 + (s[1] - '0'));
|
||||||
@ -1174,7 +1174,7 @@ inline ReturnType readTimeTextImpl(time_t & time, ReadBuffer & buf)
|
|||||||
s[size] = 0;
|
s[size] = 0;
|
||||||
|
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
throw Exception(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot parse DateTime {}", s);
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1482,7 +1482,7 @@ void readQuoted(std::vector<T> & x, ReadBuffer & buf)
|
|||||||
if (*buf.position() == ',')
|
if (*buf.position() == ',')
|
||||||
++buf.position();
|
++buf.position();
|
||||||
else
|
else
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Cannot read array from text");
|
throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Cannot read array from text");
|
||||||
}
|
}
|
||||||
|
|
||||||
first = false;
|
first = false;
|
||||||
@ -1505,7 +1505,7 @@ void readDoubleQuoted(std::vector<T> & x, ReadBuffer & buf)
|
|||||||
if (*buf.position() == ',')
|
if (*buf.position() == ',')
|
||||||
++buf.position();
|
++buf.position();
|
||||||
else
|
else
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Cannot read array from text");
|
throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Cannot read array from text");
|
||||||
}
|
}
|
||||||
|
|
||||||
first = false;
|
first = false;
|
||||||
|
@ -95,7 +95,7 @@ ReturnType parseDateTimeBestEffortImpl(
|
|||||||
FmtArgs && ...fmt_args [[maybe_unused]])
|
FmtArgs && ...fmt_args [[maybe_unused]])
|
||||||
{
|
{
|
||||||
if constexpr (std::is_same_v<ReturnType, void>)
|
if constexpr (std::is_same_v<ReturnType, void>)
|
||||||
throw ParsingException(error_code, std::move(fmt_string), std::forward<FmtArgs>(fmt_args)...);
|
throw Exception(error_code, std::move(fmt_string), std::forward<FmtArgs>(fmt_args)...);
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
@ -121,7 +121,7 @@ inline bool readDigits(ReadBuffer & buf, T & x, uint32_t & digits, int32_t & exp
|
|||||||
if (!tryReadIntText(addition_exp, buf))
|
if (!tryReadIntText(addition_exp, buf))
|
||||||
{
|
{
|
||||||
if constexpr (_throw_on_error)
|
if constexpr (_throw_on_error)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot parse exponent while reading decimal");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot parse exponent while reading decimal");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -134,7 +134,7 @@ inline bool readDigits(ReadBuffer & buf, T & x, uint32_t & digits, int32_t & exp
|
|||||||
if (digits_only)
|
if (digits_only)
|
||||||
{
|
{
|
||||||
if constexpr (_throw_on_error)
|
if constexpr (_throw_on_error)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpected symbol while reading decimal");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpected symbol while reading decimal");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
stop = true;
|
stop = true;
|
||||||
|
@ -160,7 +160,7 @@ ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
|
|||||||
if (unlikely(res.ec != std::errc()))
|
if (unlikely(res.ec != std::errc()))
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(
|
throw Exception(
|
||||||
ErrorCodes::CANNOT_PARSE_NUMBER,
|
ErrorCodes::CANNOT_PARSE_NUMBER,
|
||||||
"Cannot read floating point value here: {}",
|
"Cannot read floating point value here: {}",
|
||||||
String(initial_position, buf.buffer().end() - initial_position));
|
String(initial_position, buf.buffer().end() - initial_position));
|
||||||
@ -253,7 +253,7 @@ ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
|
|||||||
if (unlikely(res.ec != std::errc() || res.ptr - tmp_buf != num_copied_chars))
|
if (unlikely(res.ec != std::errc() || res.ptr - tmp_buf != num_copied_chars))
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(
|
throw Exception(
|
||||||
ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value here: {}", String(tmp_buf, num_copied_chars));
|
ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value here: {}", String(tmp_buf, num_copied_chars));
|
||||||
else
|
else
|
||||||
return ReturnType(false);
|
return ReturnType(false);
|
||||||
@ -342,7 +342,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
|
|||||||
if (in.eof())
|
if (in.eof())
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -400,7 +400,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
|
|||||||
if (in.eof())
|
if (in.eof())
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -438,7 +438,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
|
|||||||
if (in.eof())
|
if (in.eof())
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: no digits read");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: no digits read");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -449,14 +449,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
|
|||||||
if (in.eof())
|
if (in.eof())
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after plus sign");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after plus sign");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (negative)
|
else if (negative)
|
||||||
{
|
{
|
||||||
if constexpr (throw_exception)
|
if constexpr (throw_exception)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: plus after minus sign");
|
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: plus after minus sign");
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <Processors/Formats/IInputFormat.h>
|
#include <Processors/Formats/IInputFormat.h>
|
||||||
#include <IO/ReadBuffer.h>
|
#include <IO/ReadBuffer.h>
|
||||||
|
#include <IO/WithFileName.h>
|
||||||
|
#include <Common/Exception.h>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -11,6 +12,21 @@ IInputFormat::IInputFormat(Block header, ReadBuffer * in_)
|
|||||||
column_mapping = std::make_shared<ColumnMapping>();
|
column_mapping = std::make_shared<ColumnMapping>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Chunk IInputFormat::generate()
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
return read();
|
||||||
|
}
|
||||||
|
catch (Exception & e)
|
||||||
|
{
|
||||||
|
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
|
||||||
|
if (!file_name.empty())
|
||||||
|
e.addMessage(fmt::format("(in file/uri {})", file_name));
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void IInputFormat::resetParser()
|
void IInputFormat::resetParser()
|
||||||
{
|
{
|
||||||
chassert(in);
|
chassert(in);
|
||||||
|
@ -27,6 +27,11 @@ public:
|
|||||||
/// ReadBuffer can be nullptr for random-access formats.
|
/// ReadBuffer can be nullptr for random-access formats.
|
||||||
IInputFormat(Block header, ReadBuffer * in_);
|
IInputFormat(Block header, ReadBuffer * in_);
|
||||||
|
|
||||||
|
Chunk generate() override;
|
||||||
|
|
||||||
|
/// All data reading from the read buffer must be performed by this method.
|
||||||
|
virtual Chunk read() = 0;
|
||||||
|
|
||||||
/** In some usecase (hello Kafka) we need to read a lot of tiny streams in exactly the same format.
|
/** In some usecase (hello Kafka) we need to read a lot of tiny streams in exactly the same format.
|
||||||
* The recreating of parser for each small stream takes too long, so we introduce a method
|
* The recreating of parser for each small stream takes too long, so we introduce a method
|
||||||
* resetParser() which allow to reset the state of parser to continue reading of
|
* resetParser() which allow to reset the state of parser to continue reading of
|
||||||
@ -49,8 +54,9 @@ public:
|
|||||||
/// Must be called from ParallelParsingInputFormat before readPrefix
|
/// Must be called from ParallelParsingInputFormat before readPrefix
|
||||||
void setColumnMapping(ColumnMappingPtr column_mapping_) { column_mapping = column_mapping_; }
|
void setColumnMapping(ColumnMappingPtr column_mapping_) { column_mapping = column_mapping_; }
|
||||||
|
|
||||||
size_t getCurrentUnitNumber() const { return current_unit_number; }
|
/// Set the number of rows that was already read in
|
||||||
void setCurrentUnitNumber(size_t current_unit_number_) { current_unit_number = current_unit_number_; }
|
/// parallel parsing before creating this parser.
|
||||||
|
virtual void setRowsReadBefore(size_t /*rows*/) {}
|
||||||
|
|
||||||
void addBuffer(std::unique_ptr<ReadBuffer> buffer) { owned_buffers.emplace_back(std::move(buffer)); }
|
void addBuffer(std::unique_ptr<ReadBuffer> buffer) { owned_buffers.emplace_back(std::move(buffer)); }
|
||||||
|
|
||||||
@ -72,9 +78,6 @@ protected:
|
|||||||
bool need_only_count = false;
|
bool need_only_count = false;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// Number of currently parsed chunk (if parallel parsing is enabled)
|
|
||||||
size_t current_unit_number = 0;
|
|
||||||
|
|
||||||
std::vector<std::unique_ptr<ReadBuffer>> owned_buffers;
|
std::vector<std::unique_ptr<ReadBuffer>> owned_buffers;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -83,7 +83,7 @@ void IRowInputFormat::logError()
|
|||||||
errors_logger->logError(InputFormatErrorsLogger::ErrorEntry{now_time, total_rows, diagnostic, raw_data});
|
errors_logger->logError(InputFormatErrorsLogger::ErrorEntry{now_time, total_rows, diagnostic, raw_data});
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk IRowInputFormat::generate()
|
Chunk IRowInputFormat::read()
|
||||||
{
|
{
|
||||||
if (total_rows == 0)
|
if (total_rows == 0)
|
||||||
{
|
{
|
||||||
@ -93,10 +93,6 @@ Chunk IRowInputFormat::generate()
|
|||||||
}
|
}
|
||||||
catch (Exception & e)
|
catch (Exception & e)
|
||||||
{
|
{
|
||||||
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
|
|
||||||
if (!file_name.empty())
|
|
||||||
e.addMessage(fmt::format("(in file/uri {})", file_name));
|
|
||||||
|
|
||||||
e.addMessage("(while reading header)");
|
e.addMessage("(while reading header)");
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
@ -132,8 +128,6 @@ Chunk IRowInputFormat::generate()
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
++total_rows;
|
|
||||||
|
|
||||||
info.read_columns.clear();
|
info.read_columns.clear();
|
||||||
continue_reading = readRow(columns, info);
|
continue_reading = readRow(columns, info);
|
||||||
|
|
||||||
@ -148,6 +142,8 @@ Chunk IRowInputFormat::generate()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
++total_rows;
|
||||||
|
|
||||||
/// Some formats may read row AND say the read is finished.
|
/// Some formats may read row AND say the read is finished.
|
||||||
/// For such a case, get the number or rows from first column.
|
/// For such a case, get the number or rows from first column.
|
||||||
if (!columns.empty())
|
if (!columns.empty())
|
||||||
@ -162,6 +158,8 @@ Chunk IRowInputFormat::generate()
|
|||||||
}
|
}
|
||||||
catch (Exception & e)
|
catch (Exception & e)
|
||||||
{
|
{
|
||||||
|
++total_rows;
|
||||||
|
|
||||||
/// Logic for possible skipping of errors.
|
/// Logic for possible skipping of errors.
|
||||||
|
|
||||||
if (!isParseError(e.code()))
|
if (!isParseError(e.code()))
|
||||||
@ -204,27 +202,6 @@ Chunk IRowInputFormat::generate()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (ParsingException & e)
|
|
||||||
{
|
|
||||||
String verbose_diagnostic;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
verbose_diagnostic = getDiagnosticInfo();
|
|
||||||
}
|
|
||||||
catch (const Exception & exception)
|
|
||||||
{
|
|
||||||
verbose_diagnostic = "Cannot get verbose diagnostic: " + exception.message();
|
|
||||||
}
|
|
||||||
catch (...) // NOLINT(bugprone-empty-catch)
|
|
||||||
{
|
|
||||||
/// Error while trying to obtain verbose diagnostic. Ok to ignore.
|
|
||||||
}
|
|
||||||
|
|
||||||
e.setFileName(getFileNameFromReadBuffer(getReadBuffer()));
|
|
||||||
e.setLineNumber(static_cast<int>(total_rows));
|
|
||||||
e.addMessage(verbose_diagnostic);
|
|
||||||
throw;
|
|
||||||
}
|
|
||||||
catch (Exception & e)
|
catch (Exception & e)
|
||||||
{
|
{
|
||||||
if (!isParseError(e.code()))
|
if (!isParseError(e.code()))
|
||||||
@ -244,10 +221,6 @@ Chunk IRowInputFormat::generate()
|
|||||||
/// Error while trying to obtain verbose diagnostic. Ok to ignore.
|
/// Error while trying to obtain verbose diagnostic. Ok to ignore.
|
||||||
}
|
}
|
||||||
|
|
||||||
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
|
|
||||||
if (!file_name.empty())
|
|
||||||
e.addMessage(fmt::format("(in file/uri {})", file_name));
|
|
||||||
|
|
||||||
e.addMessage(fmt::format("(at row {})\n", total_rows));
|
e.addMessage(fmt::format("(at row {})\n", total_rows));
|
||||||
e.addMessage(verbose_diagnostic);
|
e.addMessage(verbose_diagnostic);
|
||||||
throw;
|
throw;
|
||||||
|
@ -42,7 +42,7 @@ public:
|
|||||||
|
|
||||||
IRowInputFormat(Block header, ReadBuffer & in_, Params params_);
|
IRowInputFormat(Block header, ReadBuffer & in_, Params params_);
|
||||||
|
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void resetParser() override;
|
void resetParser() override;
|
||||||
|
|
||||||
@ -79,10 +79,12 @@ protected:
|
|||||||
|
|
||||||
const BlockMissingValues & getMissingValues() const override { return block_missing_values; }
|
const BlockMissingValues & getMissingValues() const override { return block_missing_values; }
|
||||||
|
|
||||||
size_t getTotalRows() const { return total_rows; }
|
size_t getRowNum() const { return total_rows; }
|
||||||
|
|
||||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
|
void setRowsReadBefore(size_t rows) override { total_rows = rows; }
|
||||||
|
|
||||||
Serializations serializations;
|
Serializations serializations;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -28,7 +28,7 @@ ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & hea
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk ArrowBlockInputFormat::generate()
|
Chunk ArrowBlockInputFormat::read()
|
||||||
{
|
{
|
||||||
Chunk res;
|
Chunk res;
|
||||||
block_missing_values.clear();
|
block_missing_values.clear();
|
||||||
@ -64,7 +64,7 @@ Chunk ArrowBlockInputFormat::generate()
|
|||||||
{
|
{
|
||||||
auto rows = file_reader->RecordBatchCountRows(record_batch_current++);
|
auto rows = file_reader->RecordBatchCountRows(record_batch_current++);
|
||||||
if (!rows.ok())
|
if (!rows.ok())
|
||||||
throw ParsingException(
|
throw Exception(
|
||||||
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of Arrow data: {}", rows.status().ToString());
|
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of Arrow data: {}", rows.status().ToString());
|
||||||
return getChunkForCount(*rows);
|
return getChunkForCount(*rows);
|
||||||
}
|
}
|
||||||
@ -73,12 +73,12 @@ Chunk ArrowBlockInputFormat::generate()
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!batch_result.ok())
|
if (!batch_result.ok())
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA,
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
|
||||||
"Error while reading batch of Arrow data: {}", batch_result.status().ToString());
|
"Error while reading batch of Arrow data: {}", batch_result.status().ToString());
|
||||||
|
|
||||||
auto table_result = arrow::Table::FromRecordBatches({*batch_result});
|
auto table_result = arrow::Table::FromRecordBatches({*batch_result});
|
||||||
if (!table_result.ok())
|
if (!table_result.ok())
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA,
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
|
||||||
"Error while reading batch of Arrow data: {}", table_result.status().ToString());
|
"Error while reading batch of Arrow data: {}", table_result.status().ToString());
|
||||||
|
|
||||||
++record_batch_current;
|
++record_batch_current;
|
||||||
@ -213,7 +213,7 @@ std::optional<size_t> ArrowSchemaReader::readNumberOrRows()
|
|||||||
|
|
||||||
auto rows = file_reader->CountRows();
|
auto rows = file_reader->CountRows();
|
||||||
if (!rows.ok())
|
if (!rows.ok())
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of Arrow data: {}", rows.status().ToString());
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of Arrow data: {}", rows.status().ToString());
|
||||||
|
|
||||||
return *rows;
|
return *rows;
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,7 @@ public:
|
|||||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void onCancel() override
|
void onCancel() override
|
||||||
{
|
{
|
||||||
|
@ -186,7 +186,7 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No
|
|||||||
tmp = decoder.decodeBytes();
|
tmp = decoder.decodeBytes();
|
||||||
|
|
||||||
if (tmp.size() > field_type_size || tmp.empty())
|
if (tmp.size() > field_type_size || tmp.empty())
|
||||||
throw ParsingException(
|
throw Exception(
|
||||||
ErrorCodes::CANNOT_PARSE_UUID,
|
ErrorCodes::CANNOT_PARSE_UUID,
|
||||||
"Cannot parse type {}, expected non-empty binary data with size equal to or less than {}, got {}",
|
"Cannot parse type {}, expected non-empty binary data with size equal to or less than {}, got {}",
|
||||||
target_type->getName(),
|
target_type->getName(),
|
||||||
@ -274,7 +274,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(const avro
|
|||||||
{
|
{
|
||||||
decoder.decodeString(tmp);
|
decoder.decodeString(tmp);
|
||||||
if (tmp.length() != 36)
|
if (tmp.length() != 36)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", tmp);
|
throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse uuid {}", tmp);
|
||||||
|
|
||||||
const UUID uuid = parseUUID({reinterpret_cast<const UInt8 *>(tmp.data()), tmp.length()});
|
const UUID uuid = parseUUID({reinterpret_cast<const UInt8 *>(tmp.data()), tmp.length()});
|
||||||
assert_cast<DataTypeUUID::ColumnType &>(column).insertValue(uuid);
|
assert_cast<DataTypeUUID::ColumnType &>(column).insertValue(uuid);
|
||||||
@ -530,7 +530,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(const avro
|
|||||||
{
|
{
|
||||||
decoder.decodeFixed(fixed_size, tmp);
|
decoder.decodeFixed(fixed_size, tmp);
|
||||||
if (tmp.size() != 36)
|
if (tmp.size() != 36)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse UUID from type Fixed, because it's size ({}) is not equal to the size of UUID (36)", fixed_size);
|
throw Exception(ErrorCodes::CANNOT_PARSE_UUID, "Cannot parse UUID from type Fixed, because it's size ({}) is not equal to the size of UUID (36)", fixed_size);
|
||||||
|
|
||||||
const UUID uuid = parseUUID({reinterpret_cast<const UInt8 *>(tmp.data()), tmp.size()});
|
const UUID uuid = parseUUID({reinterpret_cast<const UInt8 *>(tmp.data()), tmp.size()});
|
||||||
assert_cast<DataTypeUUID::ColumnType &>(column).insertValue(uuid);
|
assert_cast<DataTypeUUID::ColumnType &>(column).insertValue(uuid);
|
||||||
|
@ -1031,17 +1031,17 @@ fileSegmentationEngineBSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t
|
|||||||
readBinaryLittleEndian(document_size, in);
|
readBinaryLittleEndian(document_size, in);
|
||||||
|
|
||||||
if (document_size < sizeof(document_size))
|
if (document_size < sizeof(document_size))
|
||||||
throw ParsingException(ErrorCodes::INCORRECT_DATA, "Size of BSON document is invalid");
|
throw Exception(ErrorCodes::INCORRECT_DATA, "Size of BSON document is invalid");
|
||||||
|
|
||||||
if (min_bytes != 0 && document_size > 10 * min_bytes)
|
if (min_bytes != 0 && document_size > 10 * min_bytes)
|
||||||
throw ParsingException(
|
throw Exception(
|
||||||
ErrorCodes::INCORRECT_DATA,
|
ErrorCodes::INCORRECT_DATA,
|
||||||
"Size of BSON document is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. Increase "
|
"Size of BSON document is extremely large. Expected not greater than {} bytes, but current is {} bytes per row. Increase "
|
||||||
"the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely BSON is malformed",
|
"the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely BSON is malformed",
|
||||||
min_bytes, document_size);
|
min_bytes, document_size);
|
||||||
|
|
||||||
if (document_size < sizeof(document_size))
|
if (document_size < sizeof(document_size))
|
||||||
throw ParsingException(ErrorCodes::INCORRECT_DATA, "Size of BSON document is invalid");
|
throw Exception(ErrorCodes::INCORRECT_DATA, "Size of BSON document is invalid");
|
||||||
|
|
||||||
size_t old_size = memory.size();
|
size_t old_size = memory.size();
|
||||||
memory.resize(old_size + document_size);
|
memory.resize(old_size + document_size);
|
||||||
|
@ -57,9 +57,6 @@ public:
|
|||||||
void resetParser() override;
|
void resetParser() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void readPrefix() override { }
|
|
||||||
void readSuffix() override { }
|
|
||||||
|
|
||||||
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
|
bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
|
||||||
bool allowSyncAfterError() const override { return true; }
|
bool allowSyncAfterError() const override { return true; }
|
||||||
void syncAfterError() override;
|
void syncAfterError() override;
|
||||||
|
@ -888,7 +888,7 @@ void DWARFBlockInputFormat::parseRanges(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk DWARFBlockInputFormat::generate()
|
Chunk DWARFBlockInputFormat::read()
|
||||||
{
|
{
|
||||||
initializeIfNeeded();
|
initializeIfNeeded();
|
||||||
|
|
||||||
|
@ -30,7 +30,7 @@ public:
|
|||||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void onCancel() override
|
void onCancel() override
|
||||||
{
|
{
|
||||||
|
@ -109,7 +109,7 @@ void JSONColumnsBlockInputFormatBase::setReadBuffer(ReadBuffer & in_)
|
|||||||
IInputFormat::setReadBuffer(in_);
|
IInputFormat::setReadBuffer(in_);
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk JSONColumnsBlockInputFormatBase::generate()
|
Chunk JSONColumnsBlockInputFormatBase::read()
|
||||||
{
|
{
|
||||||
MutableColumns columns = getPort().getHeader().cloneEmptyColumns();
|
MutableColumns columns = getPort().getHeader().cloneEmptyColumns();
|
||||||
block_missing_values.clear();
|
block_missing_values.clear();
|
||||||
|
@ -56,7 +56,7 @@ public:
|
|||||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
size_t readColumn(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name);
|
size_t readColumn(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name);
|
||||||
|
|
||||||
|
@ -142,7 +142,7 @@ inline bool JSONEachRowRowInputFormat::advanceToNextKey(size_t key_index)
|
|||||||
skipWhitespaceIfAny(*in);
|
skipWhitespaceIfAny(*in);
|
||||||
|
|
||||||
if (in->eof())
|
if (in->eof())
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream while parsing JSONEachRow format");
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream while parsing JSONEachRow format");
|
||||||
else if (*in->position() == '}')
|
else if (*in->position() == '}')
|
||||||
{
|
{
|
||||||
++in->position();
|
++in->position();
|
||||||
@ -205,7 +205,7 @@ bool JSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi
|
|||||||
return false;
|
return false;
|
||||||
skipWhitespaceIfAny(*in);
|
skipWhitespaceIfAny(*in);
|
||||||
|
|
||||||
bool is_first_row = getCurrentUnitNumber() == 0 && getTotalRows() == 1;
|
bool is_first_row = getRowNum() == 0;
|
||||||
if (checkEndOfData(is_first_row))
|
if (checkEndOfData(is_first_row))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -308,7 +308,7 @@ size_t JSONEachRowRowInputFormat::countRows(size_t max_block_size)
|
|||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
size_t num_rows = 0;
|
size_t num_rows = 0;
|
||||||
bool is_first_row = getCurrentUnitNumber() == 0 && getTotalRows() == 0;
|
bool is_first_row = getRowNum() == 0;
|
||||||
skipWhitespaceIfAny(*in);
|
skipWhitespaceIfAny(*in);
|
||||||
while (num_rows < max_block_size && !checkEndOfData(is_first_row))
|
while (num_rows < max_block_size && !checkEndOfData(is_first_row))
|
||||||
{
|
{
|
||||||
|
@ -35,7 +35,7 @@ public:
|
|||||||
reader->resetParser();
|
reader->resetParser();
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk generate() override
|
Chunk read() override
|
||||||
{
|
{
|
||||||
block_missing_values.clear();
|
block_missing_values.clear();
|
||||||
size_t block_start = getDataOffsetMaybeCompressed(*in);
|
size_t block_start = getDataOffsetMaybeCompressed(*in);
|
||||||
|
@ -905,7 +905,7 @@ bool NativeORCBlockInputFormat::prepareStripeReader()
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk NativeORCBlockInputFormat::generate()
|
Chunk NativeORCBlockInputFormat::read()
|
||||||
{
|
{
|
||||||
block_missing_values.clear();
|
block_missing_values.clear();
|
||||||
|
|
||||||
|
@ -62,7 +62,7 @@ public:
|
|||||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void onCancel() override { is_stopped = 1; }
|
void onCancel() override { is_stopped = 1; }
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_, const
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk ORCBlockInputFormat::generate()
|
Chunk ORCBlockInputFormat::read()
|
||||||
{
|
{
|
||||||
block_missing_values.clear();
|
block_missing_values.clear();
|
||||||
|
|
||||||
@ -48,7 +48,7 @@ Chunk ORCBlockInputFormat::generate()
|
|||||||
|
|
||||||
auto batch_result = file_reader->ReadStripe(stripe_current, include_indices);
|
auto batch_result = file_reader->ReadStripe(stripe_current, include_indices);
|
||||||
if (!batch_result.ok())
|
if (!batch_result.ok())
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", batch_result.status().ToString());
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", batch_result.status().ToString());
|
||||||
|
|
||||||
auto batch = batch_result.ValueOrDie();
|
auto batch = batch_result.ValueOrDie();
|
||||||
if (!batch)
|
if (!batch)
|
||||||
@ -56,7 +56,7 @@ Chunk ORCBlockInputFormat::generate()
|
|||||||
|
|
||||||
auto table_result = arrow::Table::FromRecordBatches({batch});
|
auto table_result = arrow::Table::FromRecordBatches({batch});
|
||||||
if (!table_result.ok())
|
if (!table_result.ok())
|
||||||
throw ParsingException(
|
throw Exception(
|
||||||
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString());
|
ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString());
|
||||||
|
|
||||||
/// We should extract the number of rows directly from the stripe, because in case when
|
/// We should extract the number of rows directly from the stripe, because in case when
|
||||||
|
@ -32,7 +32,7 @@ public:
|
|||||||
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void onCancel() override
|
void onCancel() override
|
||||||
{
|
{
|
||||||
|
@ -23,7 +23,7 @@ OneInputFormat::OneInputFormat(const Block & header, ReadBuffer & in_) : IInputF
|
|||||||
header.getByPosition(0).type->getName());
|
header.getByPosition(0).type->getName());
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk OneInputFormat::generate()
|
Chunk OneInputFormat::read()
|
||||||
{
|
{
|
||||||
if (done)
|
if (done)
|
||||||
return {};
|
return {};
|
||||||
|
@ -14,7 +14,7 @@ public:
|
|||||||
String getName() const override { return "One"; }
|
String getName() const override { return "One"; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool done = false;
|
bool done = false;
|
||||||
|
@ -61,7 +61,7 @@ void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupPtr thread
|
|||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
onBackgroundException(successfully_read_rows_count);
|
onBackgroundException();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,7 +90,7 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupPtr thread_grou
|
|||||||
ReadBuffer read_buffer(unit.segment.data(), unit.segment.size(), 0);
|
ReadBuffer read_buffer(unit.segment.data(), unit.segment.size(), 0);
|
||||||
|
|
||||||
InputFormatPtr input_format = internal_parser_creator(read_buffer);
|
InputFormatPtr input_format = internal_parser_creator(read_buffer);
|
||||||
input_format->setCurrentUnitNumber(current_ticket_number);
|
input_format->setRowsReadBefore(unit.offset);
|
||||||
input_format->setErrorsLogger(errors_logger);
|
input_format->setErrorsLogger(errors_logger);
|
||||||
InternalParser parser(input_format);
|
InternalParser parser(input_format);
|
||||||
|
|
||||||
@ -132,28 +132,16 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupPtr thread_grou
|
|||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
onBackgroundException(unit.offset);
|
onBackgroundException();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void ParallelParsingInputFormat::onBackgroundException(size_t offset)
|
void ParallelParsingInputFormat::onBackgroundException()
|
||||||
{
|
{
|
||||||
std::lock_guard lock(mutex);
|
std::lock_guard lock(mutex);
|
||||||
if (!background_exception)
|
if (!background_exception)
|
||||||
{
|
|
||||||
background_exception = std::current_exception();
|
background_exception = std::current_exception();
|
||||||
if (ParsingException * e = exception_cast<ParsingException *>(background_exception))
|
|
||||||
{
|
|
||||||
/// NOTE: it is not that safe to use line number hack here (may exceed INT_MAX)
|
|
||||||
if (e->getLineNumber() != -1)
|
|
||||||
e->setLineNumber(static_cast<int>(e->getLineNumber() + offset));
|
|
||||||
|
|
||||||
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
|
|
||||||
if (!file_name.empty())
|
|
||||||
e->setFileName(file_name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_server)
|
if (is_server)
|
||||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
@ -164,7 +152,7 @@ void ParallelParsingInputFormat::onBackgroundException(size_t offset)
|
|||||||
segmentator_condvar.notify_all();
|
segmentator_condvar.notify_all();
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk ParallelParsingInputFormat::generate()
|
Chunk ParallelParsingInputFormat::read()
|
||||||
{
|
{
|
||||||
/// Delayed launching of segmentator thread
|
/// Delayed launching of segmentator thread
|
||||||
if (unlikely(!parsing_started.exchange(true)))
|
if (unlikely(!parsing_started.exchange(true)))
|
||||||
|
@ -135,7 +135,7 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
Chunk generate() override final;
|
Chunk read() override final;
|
||||||
|
|
||||||
void onCancel() override final
|
void onCancel() override final
|
||||||
{
|
{
|
||||||
@ -333,7 +333,7 @@ private:
|
|||||||
/// threads. This function is used by segmentator and parsed threads.
|
/// threads. This function is used by segmentator and parsed threads.
|
||||||
/// readImpl() is called from the main thread, so the exception handling
|
/// readImpl() is called from the main thread, so the exception handling
|
||||||
/// is different.
|
/// is different.
|
||||||
void onBackgroundException(size_t offset);
|
void onBackgroundException();
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -570,7 +570,7 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_batch_idx, std::un
|
|||||||
|
|
||||||
// We may be able to schedule more work now, but can't call scheduleMoreWorkIfNeeded() right
|
// We may be able to schedule more work now, but can't call scheduleMoreWorkIfNeeded() right
|
||||||
// here because we're running on the same thread pool, so it'll deadlock if thread limit is
|
// here because we're running on the same thread pool, so it'll deadlock if thread limit is
|
||||||
// reached. Wake up generate() instead.
|
// reached. Wake up read() instead.
|
||||||
condvar.notify_all();
|
condvar.notify_all();
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -579,7 +579,7 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_batch_idx, std::un
|
|||||||
|
|
||||||
auto batch = row_group_batch.record_batch_reader->Next();
|
auto batch = row_group_batch.record_batch_reader->Next();
|
||||||
if (!batch.ok())
|
if (!batch.ok())
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", batch.status().ToString());
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", batch.status().ToString());
|
||||||
|
|
||||||
if (!*batch)
|
if (!*batch)
|
||||||
{
|
{
|
||||||
@ -637,7 +637,7 @@ void ParquetBlockInputFormat::scheduleMoreWorkIfNeeded(std::optional<size_t> row
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk ParquetBlockInputFormat::generate()
|
Chunk ParquetBlockInputFormat::read()
|
||||||
{
|
{
|
||||||
initializeIfNeeded();
|
initializeIfNeeded();
|
||||||
|
|
||||||
|
@ -65,7 +65,7 @@ public:
|
|||||||
size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; }
|
size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void onCancel() override
|
void onCancel() override
|
||||||
{
|
{
|
||||||
@ -142,7 +142,7 @@ private:
|
|||||||
// reading its data (using RAM). Row group becomes inactive when we finish reading and
|
// reading its data (using RAM). Row group becomes inactive when we finish reading and
|
||||||
// delivering all its blocks and free the RAM. Size of the window is max_decoding_threads.
|
// delivering all its blocks and free the RAM. Size of the window is max_decoding_threads.
|
||||||
//
|
//
|
||||||
// Decoded blocks are placed in `pending_chunks` queue, then picked up by generate().
|
// Decoded blocks are placed in `pending_chunks` queue, then picked up by read().
|
||||||
// If row group decoding runs too far ahead of delivery (by `max_pending_chunks_per_row_group`
|
// If row group decoding runs too far ahead of delivery (by `max_pending_chunks_per_row_group`
|
||||||
// chunks), we pause the stream for the row group, to avoid using too much memory when decoded
|
// chunks), we pause the stream for the row group, to avoid using too much memory when decoded
|
||||||
// chunks are much bigger than the compressed data.
|
// chunks are much bigger than the compressed data.
|
||||||
@ -150,7 +150,7 @@ private:
|
|||||||
// Also:
|
// Also:
|
||||||
// * If preserve_order = true, we deliver chunks strictly in order of increasing row group.
|
// * If preserve_order = true, we deliver chunks strictly in order of increasing row group.
|
||||||
// Decoding may still proceed in later row groups.
|
// Decoding may still proceed in later row groups.
|
||||||
// * If max_decoding_threads <= 1, we run all tasks inline in generate(), without thread pool.
|
// * If max_decoding_threads <= 1, we run all tasks inline in read(), without thread pool.
|
||||||
|
|
||||||
// Potential improvements:
|
// Potential improvements:
|
||||||
// * Plan all read ranges ahead of time, for the whole file, and do prefetching for them
|
// * Plan all read ranges ahead of time, for the whole file, and do prefetching for them
|
||||||
@ -189,7 +189,7 @@ private:
|
|||||||
|
|
||||||
Status status = Status::NotStarted;
|
Status status = Status::NotStarted;
|
||||||
|
|
||||||
// Window of chunks that were decoded but not returned from generate():
|
// Window of chunks that were decoded but not returned from read():
|
||||||
//
|
//
|
||||||
// (delivered) next_chunk_idx
|
// (delivered) next_chunk_idx
|
||||||
// v v v
|
// v v v
|
||||||
@ -215,7 +215,7 @@ private:
|
|||||||
std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
|
std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Chunk ready to be delivered by generate().
|
// Chunk ready to be delivered by read().
|
||||||
struct PendingChunk
|
struct PendingChunk
|
||||||
{
|
{
|
||||||
Chunk chunk;
|
Chunk chunk;
|
||||||
@ -265,7 +265,7 @@ private:
|
|||||||
// Done NotStarted
|
// Done NotStarted
|
||||||
|
|
||||||
std::mutex mutex;
|
std::mutex mutex;
|
||||||
// Wakes up the generate() call, if any.
|
// Wakes up the read() call, if any.
|
||||||
std::condition_variable condvar;
|
std::condition_variable condvar;
|
||||||
|
|
||||||
std::vector<RowGroupBatchState> row_group_batches;
|
std::vector<RowGroupBatchState> row_group_batches;
|
||||||
|
@ -140,7 +140,7 @@ ParquetMetadataInputFormat::ParquetMetadataInputFormat(ReadBuffer & in_, Block h
|
|||||||
checkHeader(getPort().getHeader());
|
checkHeader(getPort().getHeader());
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk ParquetMetadataInputFormat::generate()
|
Chunk ParquetMetadataInputFormat::read()
|
||||||
{
|
{
|
||||||
Chunk res;
|
Chunk res;
|
||||||
if (done)
|
if (done)
|
||||||
|
@ -63,7 +63,7 @@ public:
|
|||||||
void resetParser() override;
|
void resetParser() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void onCancel() override
|
void onCancel() override
|
||||||
{
|
{
|
||||||
|
@ -61,7 +61,7 @@ bool ProtobufListInputFormat::readRow(MutableColumns & columns, RowReadExtension
|
|||||||
|
|
||||||
size_t ProtobufListInputFormat::countRows(size_t max_block_size)
|
size_t ProtobufListInputFormat::countRows(size_t max_block_size)
|
||||||
{
|
{
|
||||||
if (getTotalRows() == 0)
|
if (getRowNum() == 0)
|
||||||
reader->startMessage(true);
|
reader->startMessage(true);
|
||||||
|
|
||||||
if (reader->eof())
|
if (reader->eof())
|
||||||
|
@ -92,7 +92,7 @@ static bool readName(ReadBuffer & buf, StringRef & ref, String & tmp)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream while reading key name from TSKV format");
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream while reading key name from TSKV format");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -161,7 +161,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex
|
|||||||
|
|
||||||
if (in->eof())
|
if (in->eof())
|
||||||
{
|
{
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream after field in TSKV format: {}", name_ref.toString());
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected end of stream after field in TSKV format: {}", name_ref.toString());
|
||||||
}
|
}
|
||||||
else if (*in->position() == '\t')
|
else if (*in->position() == '\t')
|
||||||
{
|
{
|
||||||
|
@ -21,7 +21,7 @@ namespace ErrorCodes
|
|||||||
|
|
||||||
[[noreturn]] static void throwUnexpectedEof(size_t row_num)
|
[[noreturn]] static void throwUnexpectedEof(size_t row_num)
|
||||||
{
|
{
|
||||||
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected EOF while parsing row {}. "
|
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Unexpected EOF while parsing row {}. "
|
||||||
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
|
"Maybe last row has wrong format or input doesn't contain specified suffix before EOF.",
|
||||||
std::to_string(row_num));
|
std::to_string(row_num));
|
||||||
}
|
}
|
||||||
@ -121,7 +121,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
|
|||||||
|
|
||||||
updateDiagnosticInfo();
|
updateDiagnosticInfo();
|
||||||
|
|
||||||
if (likely(row_num != 1))
|
if (likely(getRowNum() != 0))
|
||||||
format_reader->skipRowBetweenDelimiter();
|
format_reader->skipRowBetweenDelimiter();
|
||||||
|
|
||||||
extra.read_columns.assign(columns.size(), false);
|
extra.read_columns.assign(columns.size(), false);
|
||||||
@ -160,7 +160,7 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type,
|
|||||||
catch (Exception & e)
|
catch (Exception & e)
|
||||||
{
|
{
|
||||||
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
|
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
|
||||||
throwUnexpectedEof(row_num);
|
throwUnexpectedEof(getRowNum());
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -198,7 +198,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col
|
|||||||
|
|
||||||
out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n";
|
out << "\nUsing format string (from format_schema_rows): " << row_format.dump() << "\n";
|
||||||
out << "\nTrying to parse next row, because suffix does not match:\n";
|
out << "\nTrying to parse next row, because suffix does not match:\n";
|
||||||
if (likely(row_num != 1) && !parseDelimiterWithDiagnosticInfo(out, *buf, row_between_delimiter, "delimiter between rows", ignore_spaces))
|
if (likely(getRowNum() != 0) && !parseDelimiterWithDiagnosticInfo(out, *buf, row_between_delimiter, "delimiter between rows", ignore_spaces))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
for (size_t i = 0; i < row_format.columnsCount(); ++i)
|
for (size_t i = 0; i < row_format.columnsCount(); ++i)
|
||||||
|
@ -98,7 +98,7 @@ bool ValuesBlockInputFormat::skipToNextRow(ReadBuffer * buf, size_t min_chunk_by
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
Chunk ValuesBlockInputFormat::generate()
|
Chunk ValuesBlockInputFormat::read()
|
||||||
{
|
{
|
||||||
if (total_rows == 0)
|
if (total_rows == 0)
|
||||||
readPrefix();
|
readPrefix();
|
||||||
|
@ -58,7 +58,7 @@ private:
|
|||||||
|
|
||||||
using ConstantExpressionTemplates = std::vector<std::optional<ConstantExpressionTemplate>>;
|
using ConstantExpressionTemplates = std::vector<std::optional<ConstantExpressionTemplate>>;
|
||||||
|
|
||||||
Chunk generate() override;
|
Chunk read() override;
|
||||||
|
|
||||||
void readRow(MutableColumns & columns, size_t row_num);
|
void readRow(MutableColumns & columns, size_t row_num);
|
||||||
void readUntilTheEndOfRowAndReTokenize(size_t current_column_idx);
|
void readUntilTheEndOfRowAndReTokenize(size_t current_column_idx);
|
||||||
|
@ -26,8 +26,6 @@ RowInputFormatWithDiagnosticInfo::RowInputFormatWithDiagnosticInfo(const Block &
|
|||||||
|
|
||||||
void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()
|
void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo()
|
||||||
{
|
{
|
||||||
++row_num;
|
|
||||||
|
|
||||||
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
|
bytes_read_at_start_of_buffer_on_prev_row = bytes_read_at_start_of_buffer_on_current_row;
|
||||||
bytes_read_at_start_of_buffer_on_current_row = in->count() - in->offset();
|
bytes_read_at_start_of_buffer_on_current_row = in->count() - in->offset();
|
||||||
|
|
||||||
@ -73,7 +71,7 @@ std::pair<String, String> RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawD
|
|||||||
{
|
{
|
||||||
in->position() = in->buffer().begin() + offset_of_prev_row;
|
in->position() = in->buffer().begin() + offset_of_prev_row;
|
||||||
|
|
||||||
out_diag << "\nRow " << (row_num - 1) << ":\n";
|
out_diag << "\nRow " << getRowNum() - 1 << ":\n";
|
||||||
if (!parseRowAndPrintDiagnosticInfo(columns, out_diag))
|
if (!parseRowAndPrintDiagnosticInfo(columns, out_diag))
|
||||||
return std::make_pair(out_diag.str(), out_data.str());
|
return std::make_pair(out_diag.str(), out_data.str());
|
||||||
}
|
}
|
||||||
@ -96,7 +94,7 @@ std::pair<String, String> RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawD
|
|||||||
++data;
|
++data;
|
||||||
}
|
}
|
||||||
|
|
||||||
out_diag << "\nRow " << row_num << ":\n";
|
out_diag << "\nRow " << getRowNum() << ":\n";
|
||||||
parseRowAndPrintDiagnosticInfo(columns, out_diag);
|
parseRowAndPrintDiagnosticInfo(columns, out_diag);
|
||||||
out_diag << "\n";
|
out_diag << "\n";
|
||||||
|
|
||||||
@ -193,7 +191,6 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co
|
|||||||
void RowInputFormatWithDiagnosticInfo::resetParser()
|
void RowInputFormatWithDiagnosticInfo::resetParser()
|
||||||
{
|
{
|
||||||
IRowInputFormat::resetParser();
|
IRowInputFormat::resetParser();
|
||||||
row_num = 0;
|
|
||||||
bytes_read_at_start_of_buffer_on_current_row = 0;
|
bytes_read_at_start_of_buffer_on_current_row = 0;
|
||||||
bytes_read_at_start_of_buffer_on_prev_row = 0;
|
bytes_read_at_start_of_buffer_on_prev_row = 0;
|
||||||
offset_of_current_row = std::numeric_limits<size_t>::max();
|
offset_of_current_row = std::numeric_limits<size_t>::max();
|
||||||
|
@ -29,9 +29,6 @@ protected:
|
|||||||
virtual void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) = 0;
|
virtual void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) = 0;
|
||||||
virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;
|
virtual bool isGarbageAfterField(size_t after_input_pos_idx, ReadBuffer::Position pos) = 0;
|
||||||
|
|
||||||
/// For convenient diagnostics in case of an error.
|
|
||||||
size_t row_num = 0;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// How many bytes were read, not counting those still in the buffer.
|
/// How many bytes were read, not counting those still in the buffer.
|
||||||
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
|
size_t bytes_read_at_start_of_buffer_on_current_row = 0;
|
||||||
|
@ -66,11 +66,6 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
|
|||||||
|
|
||||||
void RowInputFormatWithNamesAndTypes::readPrefix()
|
void RowInputFormatWithNamesAndTypes::readPrefix()
|
||||||
{
|
{
|
||||||
/// This is a bit of abstraction leakage, but we need it in parallel parsing:
|
|
||||||
/// we check if this InputFormat is working with the "real" beginning of the data.
|
|
||||||
if (getCurrentUnitNumber() != 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/// Search and remove BOM only in textual formats (CSV, TSV etc), not in binary ones (RowBinary*).
|
/// Search and remove BOM only in textual formats (CSV, TSV etc), not in binary ones (RowBinary*).
|
||||||
/// Also, we assume that column name or type cannot contain BOM, so, if format has header,
|
/// Also, we assume that column name or type cannot contain BOM, so, if format has header,
|
||||||
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
|
/// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it.
|
||||||
@ -206,7 +201,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
|
|||||||
|
|
||||||
updateDiagnosticInfo();
|
updateDiagnosticInfo();
|
||||||
|
|
||||||
if (likely(row_num != 1 || getCurrentUnitNumber() != 0 || (getCurrentUnitNumber() == 0 && (with_names || with_types || is_header_detected))))
|
if (likely(getRowNum() != 0 || with_names || with_types || is_header_detected))
|
||||||
format_reader->skipRowBetweenDelimiter();
|
format_reader->skipRowBetweenDelimiter();
|
||||||
|
|
||||||
format_reader->skipRowStartDelimiter();
|
format_reader->skipRowStartDelimiter();
|
||||||
@ -270,7 +265,7 @@ size_t RowInputFormatWithNamesAndTypes::countRows(size_t max_block_size)
|
|||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
size_t num_rows = 0;
|
size_t num_rows = 0;
|
||||||
bool is_first_row = getTotalRows() == 0 && !with_names && !with_types && !is_header_detected;
|
bool is_first_row = getRowNum() == 0 && !with_names && !with_types && !is_header_detected;
|
||||||
while (!format_reader->checkForSuffix() && num_rows < max_block_size)
|
while (!format_reader->checkForSuffix() && num_rows < max_block_size)
|
||||||
{
|
{
|
||||||
if (likely(!is_first_row))
|
if (likely(!is_first_row))
|
||||||
@ -323,7 +318,7 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu
|
|||||||
if (!format_reader->tryParseSuffixWithDiagnosticInfo(out))
|
if (!format_reader->tryParseSuffixWithDiagnosticInfo(out))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (likely(row_num != 1) && !format_reader->parseRowBetweenDelimiterWithDiagnosticInfo(out))
|
if (likely(getRowNum() != 0) && !format_reader->parseRowBetweenDelimiterWithDiagnosticInfo(out))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (!format_reader->parseRowStartWithDiagnosticInfo(out))
|
if (!format_reader->parseRowStartWithDiagnosticInfo(out))
|
||||||
|
@ -29,38 +29,38 @@ void ProxyV1Handler::run()
|
|||||||
|
|
||||||
// read "PROXY"
|
// read "PROXY"
|
||||||
if (!readWord(5, word, eol) || word != "PROXY" || eol)
|
if (!readWord(5, word, eol) || word != "PROXY" || eol)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
// read "TCP4" or "TCP6" or "UNKNOWN"
|
// read "TCP4" or "TCP6" or "UNKNOWN"
|
||||||
if (!readWord(7, word, eol))
|
if (!readWord(7, word, eol))
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
if (word != "TCP4" && word != "TCP6" && word != "UNKNOWN")
|
if (word != "TCP4" && word != "TCP6" && word != "UNKNOWN")
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
if (word == "UNKNOWN" && eol)
|
if (word == "UNKNOWN" && eol)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (eol)
|
if (eol)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
// read address
|
// read address
|
||||||
if (!readWord(39, word, eol) || eol)
|
if (!readWord(39, word, eol) || eol)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
stack_data.forwarded_for = std::move(word);
|
stack_data.forwarded_for = std::move(word);
|
||||||
|
|
||||||
// read address
|
// read address
|
||||||
if (!readWord(39, word, eol) || eol)
|
if (!readWord(39, word, eol) || eol)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
// read port
|
// read port
|
||||||
if (!readWord(5, word, eol) || eol)
|
if (!readWord(5, word, eol) || eol)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
// read port and "\r\n"
|
// read port and "\r\n"
|
||||||
if (!readWord(5, word, eol) || !eol)
|
if (!readWord(5, word, eol) || !eol)
|
||||||
throw ParsingException(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
throw Exception(ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED, "PROXY protocol violation");
|
||||||
|
|
||||||
if (!stack_data.forwarded_for.empty())
|
if (!stack_data.forwarded_for.empty())
|
||||||
LOG_TRACE(log, "Forwarded client address from PROXY header: {}", stack_data.forwarded_for);
|
LOG_TRACE(log, "Forwarded client address from PROXY header: {}", stack_data.forwarded_for);
|
||||||
|
@ -294,7 +294,7 @@ def test_bad_messages_parsing_exception(kafka_cluster, max_retries=20):
|
|||||||
]:
|
]:
|
||||||
print(format_name)
|
print(format_name)
|
||||||
|
|
||||||
kafka_create_topic(admin_client, f"{format_name}_err")
|
kafka_create_topic(admin_client, f"{format_name}_parsing_err")
|
||||||
|
|
||||||
instance.query(
|
instance.query(
|
||||||
f"""
|
f"""
|
||||||
@ -305,7 +305,7 @@ def test_bad_messages_parsing_exception(kafka_cluster, max_retries=20):
|
|||||||
CREATE TABLE kafka_{format_name} (key UInt64, value UInt64)
|
CREATE TABLE kafka_{format_name} (key UInt64, value UInt64)
|
||||||
ENGINE = Kafka
|
ENGINE = Kafka
|
||||||
SETTINGS kafka_broker_list = 'kafka1:19092',
|
SETTINGS kafka_broker_list = 'kafka1:19092',
|
||||||
kafka_topic_list = '{format_name}_err',
|
kafka_topic_list = '{format_name}_parsing_err',
|
||||||
kafka_group_name = '{format_name}',
|
kafka_group_name = '{format_name}',
|
||||||
kafka_format = '{format_name}',
|
kafka_format = '{format_name}',
|
||||||
kafka_num_consumers = 1;
|
kafka_num_consumers = 1;
|
||||||
@ -316,16 +316,18 @@ def test_bad_messages_parsing_exception(kafka_cluster, max_retries=20):
|
|||||||
)
|
)
|
||||||
|
|
||||||
kafka_produce(
|
kafka_produce(
|
||||||
kafka_cluster, f"{format_name}_err", ["qwertyuiop", "asdfghjkl", "zxcvbnm"]
|
kafka_cluster,
|
||||||
|
f"{format_name}_parsing_err",
|
||||||
|
["qwertyuiop", "asdfghjkl", "zxcvbnm"],
|
||||||
)
|
)
|
||||||
|
|
||||||
expected_result = """avro::Exception: Invalid data file. Magic does not match: : while parsing Kafka message (topic: Avro_err, partition: 0, offset: 0)\\'|1|1|1|default|kafka_Avro
|
expected_result = """avro::Exception: Invalid data file. Magic does not match: : while parsing Kafka message (topic: Avro_parsing_err, partition: 0, offset: 0)\\'|1|1|1|default|kafka_Avro
|
||||||
Cannot parse input: expected \\'{\\' before: \\'qwertyuiop\\': while parsing Kafka message (topic: JSONEachRow_err, partition: 0, offset: 0|1|1|1|default|kafka_JSONEachRow
|
Cannot parse input: expected \\'{\\' before: \\'qwertyuiop\\': (at row 1)\\n: while parsing Kafka message (topic: JSONEachRow_parsing_err, partition:|1|1|1|default|kafka_JSONEachRow
|
||||||
"""
|
"""
|
||||||
# filter out stacktrace in exceptions.text[1] because it is hardly stable enough
|
# filter out stacktrace in exceptions.text[1] because it is hardly stable enough
|
||||||
result_system_kafka_consumers = instance.query_with_retry(
|
result_system_kafka_consumers = instance.query_with_retry(
|
||||||
"""
|
"""
|
||||||
SELECT substr(exceptions.text[1], 1, 131), length(exceptions.text) > 1 AND length(exceptions.text) < 15, length(exceptions.time) > 1 AND length(exceptions.time) < 15, abs(dateDiff('second', exceptions.time[1], now())) < 40, database, table FROM system.kafka_consumers WHERE table in('kafka_Avro', 'kafka_JSONEachRow') ORDER BY table, assignments.partition_id[1]
|
SELECT substr(exceptions.text[1], 1, 139), length(exceptions.text) > 1 AND length(exceptions.text) < 15, length(exceptions.time) > 1 AND length(exceptions.time) < 15, abs(dateDiff('second', exceptions.time[1], now())) < 40, database, table FROM system.kafka_consumers WHERE table in('kafka_Avro', 'kafka_JSONEachRow') ORDER BY table, assignments.partition_id[1]
|
||||||
""",
|
""",
|
||||||
retry_count=max_retries,
|
retry_count=max_retries,
|
||||||
sleep_time=1,
|
sleep_time=1,
|
||||||
@ -338,7 +340,7 @@ Cannot parse input: expected \\'{\\' before: \\'qwertyuiop\\': while parsing Kaf
|
|||||||
"Avro",
|
"Avro",
|
||||||
"JSONEachRow",
|
"JSONEachRow",
|
||||||
]:
|
]:
|
||||||
kafka_delete_topic(admin_client, f"{format_name}_err")
|
kafka_delete_topic(admin_client, f"{format_name}_parsing_err")
|
||||||
|
|
||||||
|
|
||||||
def test_bad_messages_to_mv(kafka_cluster, max_retries=20):
|
def test_bad_messages_to_mv(kafka_cluster, max_retries=20):
|
||||||
|
@ -1,20 +1,20 @@
|
|||||||
Cannot parse input: expected \'{\' before: \'Error 0\' Error 0 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 0\': (at row 1)\n Error 0 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 1\' Error 1 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 1\': (at row 1)\n Error 1 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 2\' Error 2 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 2\': (at row 1)\n Error 2 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 3\' Error 3 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 3\': (at row 1)\n Error 3 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 4\' Error 4 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 4\': (at row 1)\n Error 4 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 5\' Error 5 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 5\': (at row 1)\n Error 5 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 6\' Error 6 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 6\': (at row 1)\n Error 6 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 7\' Error 7 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 7\': (at row 1)\n Error 7 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 8\' Error 8 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 8\': (at row 1)\n Error 8 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 9\' Error 9 a.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 9\': (at row 1)\n Error 9 a.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 10\' Error 10 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 10\': (at row 1)\n Error 10 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 11\' Error 11 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 11\': (at row 1)\n Error 11 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 12\' Error 12 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 12\': (at row 1)\n Error 12 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 13\' Error 13 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 13\': (at row 1)\n Error 13 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 14\' Error 14 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 14\': (at row 1)\n Error 14 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 15\' Error 15 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 15\': (at row 1)\n Error 15 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 16\' Error 16 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 16\': (at row 1)\n Error 16 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 17\' Error 17 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 17\': (at row 1)\n Error 17 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 18\' Error 18 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 18\': (at row 1)\n Error 18 b.jsonl
|
||||||
Cannot parse input: expected \'{\' before: \'Error 19\' Error 19 b.jsonl
|
Cannot parse input: expected \'{\' before: \'Error 19\': (at row 1)\n Error 19 b.jsonl
|
||||||
|
Loading…
Reference in New Issue
Block a user