2023-07-31 08:50:09 +00:00
# include <IO/Archives/LibArchiveReader.h>
# include <IO/ReadBufferFromFileBase.h>
2023-05-29 17:10:03 +00:00
# include <Common/quoteString.h>
2023-08-04 08:59:55 +00:00
# include <Common/scope_guard_safe.h>
2023-05-29 17:10:03 +00:00
2023-07-28 11:55:23 +00:00
# include <IO/Archives/ArchiveUtils.h>
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
# include <mutex>
2023-07-31 08:50:09 +00:00
2023-05-29 20:08:18 +00:00
namespace DB
{
2023-07-28 13:00:35 +00:00
# if USE_LIBARCHIVE
2023-05-29 20:08:18 +00:00
/// Error codes referenced in this file. They are only declared here (`extern`);
/// the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int CANNOT_READ_ALL_DATA;
    extern const int CANNOT_UNPACK_ARCHIVE;
    extern const int LOGICAL_ERROR;
    extern const int UNSUPPORTED_METHOD;
}
2023-07-31 08:50:09 +00:00
2023-08-04 08:59:55 +00:00
class LibArchiveReader : : Handle
2023-05-29 20:08:18 +00:00
{
2023-05-29 17:10:03 +00:00
public :
2023-08-04 08:59:55 +00:00
explicit Handle ( std : : string path_to_archive_ , bool lock_on_reading_ )
: path_to_archive ( path_to_archive_ ) , lock_on_reading ( lock_on_reading_ )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
current_archive = open ( path_to_archive ) ;
}
Handle ( const Handle & ) = delete ;
Handle ( Handle & & other ) noexcept
: current_archive ( other . current_archive )
, current_entry ( other . current_entry )
2023-08-04 13:57:18 +00:00
, lock_on_reading ( other . lock_on_reading )
2023-08-01 13:48:49 +00:00
{
other . current_archive = nullptr ;
other . current_entry = nullptr ;
2023-05-29 17:10:03 +00:00
}
2023-07-31 08:50:09 +00:00
2023-05-29 20:08:18 +00:00
~ Handle ( )
{
2023-08-04 08:59:55 +00:00
close ( current_archive ) ;
2023-05-29 17:10:03 +00:00
}
2023-08-02 07:40:59 +00:00
bool locateFile ( const std : : string & filename )
{
return locateFile ( [ & ] ( const std : : string & file ) { return file = = filename ; } ) ;
}
bool locateFile ( NameFilter filter )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
resetFileInfo ( ) ;
int err = ARCHIVE_OK ;
while ( true )
2023-05-29 17:10:03 +00:00
{
2023-08-04 08:59:55 +00:00
err = readNextHeader ( current_archive , & current_entry ) ;
2023-08-01 13:48:49 +00:00
if ( err = = ARCHIVE_RETRY )
continue ;
if ( err ! = ARCHIVE_OK )
break ;
2023-08-02 07:40:59 +00:00
if ( filter ( archive_entry_pathname ( current_entry ) ) )
2023-05-29 17:10:03 +00:00
return true ;
}
2023-08-01 13:48:49 +00:00
checkError ( err ) ;
2023-05-29 17:10:03 +00:00
return false ;
}
2023-08-01 13:48:49 +00:00
bool nextFile ( )
{
resetFileInfo ( ) ;
int err = ARCHIVE_OK ;
do
{
2023-08-04 08:59:55 +00:00
err = readNextHeader ( current_archive , & current_entry ) ;
2023-08-01 13:48:49 +00:00
} while ( err = = ARCHIVE_RETRY ) ;
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
checkError ( err ) ;
return err = = ARCHIVE_OK ;
}
2023-05-29 17:10:03 +00:00
2023-08-02 07:40:59 +00:00
std : : vector < std : : string > getAllFiles ( NameFilter filter )
2023-05-29 17:10:03 +00:00
{
2023-08-01 13:48:49 +00:00
auto * archive = open ( path_to_archive ) ;
2023-08-04 08:59:55 +00:00
SCOPE_EXIT (
close ( archive ) ;
) ;
2023-08-04 14:29:49 +00:00
struct archive_entry * entry = nullptr ;
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
std : : vector < std : : string > files ;
2023-08-04 08:59:55 +00:00
int error = readNextHeader ( archive , & entry ) ;
2023-08-01 13:48:49 +00:00
while ( error = = ARCHIVE_OK | | error = = ARCHIVE_RETRY )
{
2023-08-04 14:29:49 +00:00
chassert ( entry ! = nullptr ) ;
2023-08-02 07:40:59 +00:00
std : : string name = archive_entry_pathname ( entry ) ;
if ( ! filter | | filter ( name ) )
files . push_back ( std : : move ( name ) ) ;
2023-08-04 08:59:55 +00:00
error = readNextHeader ( archive , & entry ) ;
2023-08-01 13:48:49 +00:00
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
checkError ( error ) ;
return files ;
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
const String & getFileName ( ) const
{
2023-08-04 13:57:18 +00:00
chassert ( current_entry ) ;
2023-08-01 13:48:49 +00:00
if ( ! file_name )
file_name . emplace ( archive_entry_pathname ( current_entry ) ) ;
return * file_name ;
}
const FileInfo & getFileInfo ( ) const
{
2023-08-04 13:57:18 +00:00
chassert ( current_entry ) ;
2023-08-01 13:48:49 +00:00
if ( ! file_info )
2023-05-29 17:10:03 +00:00
{
2023-08-01 13:48:49 +00:00
file_info . emplace ( ) ;
file_info - > uncompressed_size = archive_entry_size ( current_entry ) ;
file_info - > compressed_size = archive_entry_size ( current_entry ) ;
file_info - > is_encrypted = false ;
2023-05-29 17:10:03 +00:00
}
2023-08-01 13:48:49 +00:00
return * file_info ;
}
struct archive * current_archive ;
2023-08-04 13:57:18 +00:00
struct archive_entry * current_entry = nullptr ;
2023-08-01 13:48:49 +00:00
private :
2023-08-04 08:59:55 +00:00
void checkError ( int error ) const
{
if ( error = = ARCHIVE_FATAL )
throw Exception ( ErrorCodes : : CANNOT_UNPACK_ARCHIVE , " Failed to read archive while fetching all files: {} " , archive_error_string ( current_archive ) ) ;
}
void resetFileInfo ( )
{
file_name . reset ( ) ;
file_info . reset ( ) ;
}
static struct archive * open ( const String & path_to_archive )
{
auto * archive = archive_read_new ( ) ;
try
{
2024-02-20 05:48:53 +00:00
// Support for bzip2, gzip, lzip, xz, zstd and lz4
archive_read_support_filter_bzip2 ( archive ) ;
archive_read_support_filter_gzip ( archive ) ;
archive_read_support_filter_xz ( archive ) ;
archive_read_support_filter_lz4 ( archive ) ;
archive_read_support_filter_zstd ( archive ) ;
// Support tar, 7zip and zip
archive_read_support_format_tar ( archive ) ;
archive_read_support_format_7zip ( archive ) ;
archive_read_support_format_zip ( archive ) ;
2023-08-04 08:59:55 +00:00
if ( archive_read_open_filename ( archive , path_to_archive . c_str ( ) , 10240 ) ! = ARCHIVE_OK )
2023-08-09 10:04:29 +00:00
throw Exception ( ErrorCodes : : CANNOT_UNPACK_ARCHIVE , " Couldn't open archive {}: {} " , quoteString ( path_to_archive ) , archive_error_string ( archive ) ) ;
2023-08-04 08:59:55 +00:00
}
catch ( . . . )
{
close ( archive ) ;
throw ;
}
return archive ;
}
static void close ( struct archive * archive )
{
if ( archive )
{
archive_read_close ( archive ) ;
archive_read_free ( archive ) ;
}
}
int readNextHeader ( struct archive * archive , struct archive_entry * * entry ) const
{
std : : unique_lock lock ( Handle : : read_lock , std : : defer_lock ) ;
if ( lock_on_reading )
lock . lock ( ) ;
return archive_read_next_header ( archive , entry ) ;
}
2023-08-01 13:48:49 +00:00
const String path_to_archive ;
2023-08-04 08:59:55 +00:00
/// for some archive types when we are reading headers static variables are used
/// which are not thread-safe
2023-08-04 13:57:18 +00:00
const bool lock_on_reading ;
2023-08-04 08:59:55 +00:00
static inline std : : mutex read_lock ;
2023-08-01 13:48:49 +00:00
mutable std : : optional < String > file_name ;
mutable std : : optional < FileInfo > file_info ;
} ;
2023-08-04 08:59:55 +00:00
class LibArchiveReader : : FileEnumeratorImpl : public FileEnumerator
2023-08-01 13:48:49 +00:00
{
public :
explicit FileEnumeratorImpl ( Handle handle_ ) : handle ( std : : move ( handle_ ) ) { }
const String & getFileName ( ) const override { return handle . getFileName ( ) ; }
const FileInfo & getFileInfo ( ) const override { return handle . getFileInfo ( ) ; }
bool nextFile ( ) override { return handle . nextFile ( ) ; }
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
/// Releases owned handle to pass it to a read buffer.
Handle releaseHandle ( ) & & { return std : : move ( handle ) ; }
private :
Handle handle ;
} ;
2023-08-04 08:59:55 +00:00
class LibArchiveReader : : ReadBufferFromLibArchive : public ReadBufferFromFileBase
2023-08-01 13:48:49 +00:00
{
public :
explicit ReadBufferFromLibArchive ( Handle handle_ , std : : string path_to_archive_ )
: ReadBufferFromFileBase ( DBMS_DEFAULT_BUFFER_SIZE , nullptr , 0 )
, handle ( std : : move ( handle_ ) )
, path_to_archive ( std : : move ( path_to_archive_ ) )
{ }
off_t seek ( off_t /* off */ , int /* whence */ ) override
{
throw Exception ( ErrorCodes : : UNSUPPORTED_METHOD , " Seek is not supported when reading from archive " ) ;
2023-05-29 17:10:03 +00:00
}
2024-01-19 01:46:07 +00:00
bool checkIfActuallySeekable ( ) override { return false ; }
2023-08-01 13:48:49 +00:00
off_t getPosition ( ) override
{
2024-02-13 11:02:46 +00:00
throw Exception ( ErrorCodes : : UNSUPPORTED_METHOD , " getPosition not supported when reading from archive " ) ;
2023-08-01 13:48:49 +00:00
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
String getFileName ( ) const override { return handle . getFileName ( ) ; }
2023-05-29 17:10:03 +00:00
2023-08-23 08:10:30 +00:00
size_t getFileSize ( ) override { return handle . getFileInfo ( ) . uncompressed_size ; }
2023-08-01 13:48:49 +00:00
Handle releaseHandle ( ) & &
{
return std : : move ( handle ) ;
}
2023-05-29 17:10:03 +00:00
private :
bool nextImpl ( ) override
{
2023-08-01 13:48:49 +00:00
auto bytes_read = archive_read_data ( handle . current_archive , internal_buffer . begin ( ) , static_cast < int > ( internal_buffer . size ( ) ) ) ;
2023-05-29 17:10:03 +00:00
2023-07-31 13:27:06 +00:00
if ( bytes_read < 0 )
2023-08-01 13:48:49 +00:00
throw Exception ( ErrorCodes : : CANNOT_READ_ALL_DATA , " Failed to read file {} from {}: {} " , handle . getFileName ( ) , path_to_archive , archive_error_string ( handle . current_archive ) ) ;
2023-07-31 13:27:06 +00:00
2023-05-29 17:10:03 +00:00
if ( ! bytes_read )
return false ;
2023-08-01 13:48:49 +00:00
total_bytes_read + = bytes ;
2023-05-29 17:10:03 +00:00
working_buffer = internal_buffer ;
working_buffer . resize ( bytes_read ) ;
return true ;
}
2023-07-31 13:27:06 +00:00
2023-05-29 17:10:03 +00:00
Handle handle ;
const String path_to_archive ;
2023-08-01 13:48:49 +00:00
size_t total_bytes_read = 0 ;
2023-05-29 17:10:03 +00:00
} ;
2023-08-04 08:59:55 +00:00
/// Stores the archive type name, the header-read locking flag and the archive path.
/// Nothing is opened here; each operation opens the archive through its own Handle.
LibArchiveReader::LibArchiveReader(std::string archive_name_, bool lock_on_reading_, std::string path_to_archive_)
    : archive_name(std::move(archive_name_))
    , lock_on_reading(lock_on_reading_)
    , path_to_archive(std::move(path_to_archive_))
{
}
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
/// Defaulted destructor, defined out of line in this translation unit.
LibArchiveReader::~LibArchiveReader() = default;
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
const std : : string & LibArchiveReader : : getPath ( ) const
2023-08-01 13:48:49 +00:00
{
return path_to_archive ;
}
2023-07-28 11:55:23 +00:00
2023-08-04 08:59:55 +00:00
bool LibArchiveReader : : fileExists ( const String & filename )
2023-05-29 20:08:18 +00:00
{
2023-08-04 08:59:55 +00:00
Handle handle ( path_to_archive , lock_on_reading ) ;
2023-05-29 20:08:18 +00:00
return handle . locateFile ( filename ) ;
}
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
/// Opens the archive with a fresh handle, finds the entry named `filename`
/// and returns a copy of its info.
/// Throws CANNOT_UNPACK_ARCHIVE if there is no such entry.
LibArchiveReader::FileInfo LibArchiveReader::getFileInfo(const String & filename)
{
    Handle scanner(path_to_archive, lock_on_reading);

    const bool found = scanner.locateFile(filename);
    if (found)
        return scanner.getFileInfo();

    throw Exception(ErrorCodes::CANNOT_UNPACK_ARCHIVE, "Couldn't unpack archive {}: file not found", path_to_archive);
}
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
std : : unique_ptr < LibArchiveReader : : FileEnumerator > LibArchiveReader : : firstFile ( )
2023-05-29 20:08:18 +00:00
{
2023-08-04 08:59:55 +00:00
Handle handle ( path_to_archive , lock_on_reading ) ;
2023-08-01 13:48:49 +00:00
if ( ! handle . nextFile ( ) )
return nullptr ;
return std : : make_unique < FileEnumeratorImpl > ( std : : move ( handle ) ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-08-09 10:04:29 +00:00
std : : unique_ptr < ReadBufferFromFileBase > LibArchiveReader : : readFile ( const String & filename , bool throw_on_not_found )
2023-08-02 07:40:59 +00:00
{
2023-08-09 10:04:29 +00:00
return readFile ( [ & ] ( const std : : string & file ) { return file = = filename ; } , throw_on_not_found ) ;
2023-08-02 07:40:59 +00:00
}
2023-08-09 10:04:29 +00:00
std : : unique_ptr < ReadBufferFromFileBase > LibArchiveReader : : readFile ( NameFilter filter , bool throw_on_not_found )
2023-05-29 20:08:18 +00:00
{
2023-08-04 08:59:55 +00:00
Handle handle ( path_to_archive , lock_on_reading ) ;
2023-08-03 10:15:02 +00:00
if ( ! handle . locateFile ( filter ) )
2023-08-09 10:04:29 +00:00
{
if ( throw_on_not_found )
throw Exception (
ErrorCodes : : CANNOT_UNPACK_ARCHIVE , " Couldn't unpack archive {}: no file found satisfying the filter " , path_to_archive ) ;
return nullptr ;
}
2023-08-01 13:48:49 +00:00
return std : : make_unique < ReadBufferFromLibArchive > ( std : : move ( handle ) , path_to_archive ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
std : : unique_ptr < ReadBufferFromFileBase > LibArchiveReader : : readFile ( std : : unique_ptr < FileEnumerator > enumerator )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
if ( ! dynamic_cast < FileEnumeratorImpl * > ( enumerator . get ( ) ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Wrong enumerator passed to readFile() " ) ;
auto enumerator_impl = std : : unique_ptr < FileEnumeratorImpl > ( static_cast < FileEnumeratorImpl * > ( enumerator . release ( ) ) ) ;
auto handle = std : : move ( * enumerator_impl ) . releaseHandle ( ) ;
return std : : make_unique < ReadBufferFromLibArchive > ( std : : move ( handle ) , path_to_archive ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-08-04 08:59:55 +00:00
std : : unique_ptr < LibArchiveReader : : FileEnumerator > LibArchiveReader : : nextFile ( std : : unique_ptr < ReadBuffer > read_buffer )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
if ( ! dynamic_cast < ReadBufferFromLibArchive * > ( read_buffer . get ( ) ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Wrong ReadBuffer passed to nextFile() " ) ;
auto read_buffer_from_libarchive = std : : unique_ptr < ReadBufferFromLibArchive > ( static_cast < ReadBufferFromLibArchive * > ( read_buffer . release ( ) ) ) ;
auto handle = std : : move ( * read_buffer_from_libarchive ) . releaseHandle ( ) ;
if ( ! handle . nextFile ( ) )
return nullptr ;
return std : : make_unique < FileEnumeratorImpl > ( std : : move ( handle ) ) ;
2023-05-29 20:08:18 +00:00
}
2024-01-22 22:55:50 +00:00
std : : unique_ptr < LibArchiveReader : : FileEnumerator > LibArchiveReader : : currentFile ( std : : unique_ptr < ReadBuffer > read_buffer )
{
if ( ! dynamic_cast < ReadBufferFromLibArchive * > ( read_buffer . get ( ) ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Wrong ReadBuffer passed to nextFile() " ) ;
auto read_buffer_from_libarchive = std : : unique_ptr < ReadBufferFromLibArchive > ( static_cast < ReadBufferFromLibArchive * > ( read_buffer . release ( ) ) ) ;
auto handle = std : : move ( * read_buffer_from_libarchive ) . releaseHandle ( ) ;
return std : : make_unique < FileEnumeratorImpl > ( std : : move ( handle ) ) ;
}
2023-08-04 08:59:55 +00:00
std : : vector < std : : string > LibArchiveReader : : getAllFiles ( )
2023-08-02 07:40:59 +00:00
{
return getAllFiles ( { } ) ;
}
2023-08-04 08:59:55 +00:00
std : : vector < std : : string > LibArchiveReader : : getAllFiles ( NameFilter filter )
2023-08-01 13:48:49 +00:00
{
2023-08-04 08:59:55 +00:00
Handle handle ( path_to_archive , lock_on_reading ) ;
2023-08-02 07:40:59 +00:00
return handle . getAllFiles ( filter ) ;
2023-08-01 13:48:49 +00:00
}
2023-05-29 20:08:18 +00:00
2023-08-04 08:59:55 +00:00
void LibArchiveReader : : setPassword ( const String & /*password_*/ )
2023-05-29 20:08:18 +00:00
{
2023-08-04 08:59:55 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Can not set password to {} archive " , archive_name ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-07-28 13:00:35 +00:00
# endif
2023-05-29 17:10:03 +00:00
}