Unbreak reading from web servers that don't support HEAD requests

This commit is contained in:
Michael Kolupaev 2023-04-17 04:57:17 +00:00
parent 473f212c82
commit bd426a7d6d
2 changed files with 40 additions and 13 deletions

View File

@ -91,6 +91,7 @@ namespace detail
class ReadWriteBufferFromHTTPBase : public SeekableReadBuffer, public WithFileName, public WithFileSize
{
public:
/// Information from HTTP response header.
struct FileInfo
{
// nullopt if the server doesn't report it.
@ -796,7 +797,24 @@ namespace detail
/// Obtains file information by sending a HEAD request and parsing the response headers
/// with parseFileInfo().
///
/// If the HEAD request fails with a 4xx status other than 429 (Too Many Requests),
/// an empty (default-constructed) FileInfo is returned instead of failing.
/// Any other HTTPException is rethrown to the caller.
FileInfo getFileInfo()
{
    Poco::Net::HTTPResponse response;
    try
    {
        /// The HEAD request must be issued only here, inside the try block: an unguarded call
        /// before it would throw before the fallback below could ever run.
        getHeadResponse(response);
    }
    catch (HTTPException & e)
    {
        /// Maybe the web server doesn't support HEAD requests.
        /// E.g. webhdfs reports status 400.
        /// We should proceed in hopes that the actual GET request will succeed.
        /// (Unless the error is transient. Don't want to nondeterministically sometimes
        /// fall back to slow whole-file reads when HEAD is actually supported; that sounds
        /// like a nightmare to debug.)
        const auto status = e.getHTTPStatus();
        if (status >= 400 && status <= 499 &&
            status != Poco::Net::HTTPResponse::HTTP_TOO_MANY_REQUESTS)
            return FileInfo{};

        throw;
    }
    return parseFileInfo(response, 0);
}

View File

@ -46,25 +46,34 @@ public:
/// Base implementation: always throws NOT_IMPLEMENTED. Being virtual, it can be overridden.
virtual size_t getFileOffsetOfBufferEnd() const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFileOffsetOfBufferEnd() not implemented");
}
/// When this returns true, setReadUntilPosition() guarantees that eof will be reported
/// at the given position. The base implementation returns false.
virtual bool supportsRightBoundedReads() const
{
    return false;
}
/// Whether this buffer is integrated with the filesystem cache. The base implementation reports false.
virtual bool isIntegratedWithFilesystemCache() const
{
    return false;
}
/// Tells whether seek() genuinely works on this buffer. A false result means seek() will
/// always throw (or will make subsequent nextImpl() calls throw).
///
/// Why this exists:
///  * There is not always a cheap way to know in advance whether the buffer is really seekable.
///    Specifically, the HTTP read buffer has to send a request to check whether the server
///    supports byte ranges.
///  * When such a buffer is created, it may not be known yet whether it will need to be
///    seekable, so the price of this check should not be paid up front.
///
/// The base implementation returns true.
virtual bool checkIfActuallySeekable()
{
    return true;
}
};
/// Useful for reading in parallel.
/// The created read buffers may outlive the factory.
///
/// There are 2 ways to use this:
/// (1) Never call seek() or getFileSize(), read the file sequentially.
/// For HTTP, this usually translates to just one HTTP request.
/// (2) Call checkIfActuallySeekable(), then:
/// a. If it returned false, go to (1). seek() and getFileSize() are not available (throw if called).
/// b. If it returned true, seek() and getFileSize() are available, knock yourself out.
/// For HTTP, checkIfActuallySeekable() sends a HEAD request and returns false if the web server
/// doesn't support ranges (or doesn't support HEAD requests).
class SeekableReadBufferFactory : public WithFileSize
{
public: