Read less unnecessary data from Parquet files

This commit is contained in:
Michael Kolupaev 2023-03-13 18:51:56 +00:00
parent 0dab82c420
commit dc6e34075e
4 changed files with 22 additions and 3 deletions

View File

@@ -638,7 +638,7 @@ class IColumn;
M(Int64, read_priority, 0, "Priority to read data from local filesystem. Only supported for 'pread_threadpool' method.", 0) \ M(Int64, read_priority, 0, "Priority to read data from local filesystem. Only supported for 'pread_threadpool' method.", 0) \
M(UInt64, merge_tree_min_rows_for_concurrent_read_for_remote_filesystem, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \ M(UInt64, merge_tree_min_rows_for_concurrent_read_for_remote_filesystem, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \
M(UInt64, merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \ M(UInt64, merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized, when reading from remote filesystem.", 0) \
M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes required for remote read (url, s3) to do seek, instead for read with ignore.", 0) \ M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes required for remote read (url, s3) to do seek, instead of read with ignore.", 0) \
\ \
M(UInt64, async_insert_threads, 16, "Maximum number of threads to actually parse and insert data in background. Zero means asynchronous mode is disabled", 0) \ M(UInt64, async_insert_threads, 16, "Maximum number of threads to actually parse and insert data in background. Zero means asynchronous mode is disabled", 0) \
M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. Makes sense only for inserts via HTTP protocol. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \ M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. Makes sense only for inserts via HTTP protocol. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \

View File

@@ -270,7 +270,25 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position)
if (position != static_cast<size_t>(read_until_position)) if (position != static_cast<size_t>(read_until_position))
{ {
read_until_position = position; read_until_position = position;
impl.reset(); if (impl)
{
// Not exactly a seek, but close enough.
ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
impl.reset();
}
}
}
// Clears a previously set read limit: when read_until_position is non-zero it is
// reset to 0 (presumably 0 stands for "no upper bound", judging by the method
// name - TODO confirm against the code that consumes read_until_position).
// Does nothing when read_until_position is already 0.
void ReadBufferFromS3::setReadUntilEnd()
{
if (read_until_position)
{
read_until_position = 0;
if (impl)
{
// An existing impl is dropped here; the drop is recorded under the
// ReadBufferSeekCancelConnection profile event before impl is reset.
ProfileEvents::increment(ProfileEvents::ReadBufferSeekCancelConnection);
impl.reset();
}
} }
} }

View File

@@ -69,6 +69,7 @@ public:
size_t getFileSize() override; size_t getFileSize() override;
void setReadUntilPosition(size_t position) override; void setReadUntilPosition(size_t position) override;
void setReadUntilEnd() override;
Range getRemainingReadRange() const override; Range getRemainingReadRange() const override;

View File

@@ -16,7 +16,7 @@ namespace ErrorCodes
} }
/// Base class for schema inference for the data in some specific format. /// Base class for schema inference for the data in some specific format.
/// It reads some data from read buffer and try to determine the schema /// It reads some data from read buffer and tries to determine the schema
/// from read data. /// from read data.
class ISchemaReader class ISchemaReader
{ {