2023-07-31 08:50:09 +00:00
# include <IO/Archives/LibArchiveReader.h>
# include <IO/ReadBufferFromFileBase.h>
2023-05-29 17:10:03 +00:00
# include <Common/quoteString.h>
2023-07-28 11:55:23 +00:00
# include <IO/Archives/ArchiveUtils.h>
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
2023-05-29 20:08:18 +00:00
namespace DB
{
2023-07-28 13:00:35 +00:00
# if USE_LIBARCHIVE
2023-05-29 20:08:18 +00:00
/// Error codes referenced by this translation unit; the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int CANNOT_READ_ALL_DATA;
    extern const int CANNOT_UNPACK_ARCHIVE;
    extern const int LOGICAL_ERROR;
    extern const int NOT_IMPLEMENTED;
    extern const int SEEK_POSITION_OUT_OF_BOUND;
    extern const int UNSUPPORTED_METHOD;
}
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
class LibArchiveReader < ArchiveInfo > : : Handle
2023-05-29 20:08:18 +00:00
{
2023-05-29 17:10:03 +00:00
public :
2023-07-28 11:55:23 +00:00
explicit Handle ( const String & path_to_archive_ ) : path_to_archive ( path_to_archive_ )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
current_archive = open ( path_to_archive ) ;
current_entry = archive_entry_new ( ) ;
}
Handle ( const Handle & ) = delete ;
Handle ( Handle & & other ) noexcept
: current_archive ( other . current_archive )
, current_entry ( other . current_entry )
{
other . current_archive = nullptr ;
other . current_entry = nullptr ;
2023-05-29 17:10:03 +00:00
}
2023-07-31 08:50:09 +00:00
2023-05-29 20:08:18 +00:00
~ Handle ( )
{
2023-08-01 13:48:49 +00:00
if ( current_archive )
{
archive_read_close ( current_archive ) ;
archive_read_free ( current_archive ) ;
}
2023-05-29 17:10:03 +00:00
}
2023-05-29 20:08:18 +00:00
bool locateFile ( const String & filename )
{
2023-08-01 13:48:49 +00:00
resetFileInfo ( ) ;
int err = ARCHIVE_OK ;
while ( true )
2023-05-29 17:10:03 +00:00
{
2023-08-01 13:48:49 +00:00
err = archive_read_next_header ( current_archive , & current_entry ) ;
if ( err = = ARCHIVE_RETRY )
continue ;
if ( err ! = ARCHIVE_OK )
break ;
if ( archive_entry_pathname ( current_entry ) = = filename )
2023-05-29 17:10:03 +00:00
return true ;
}
2023-08-01 13:48:49 +00:00
checkError ( err ) ;
2023-05-29 17:10:03 +00:00
return false ;
}
2023-08-01 13:48:49 +00:00
bool nextFile ( )
{
resetFileInfo ( ) ;
int err = ARCHIVE_OK ;
do
{
err = archive_read_next_header ( current_archive , & current_entry ) ;
} while ( err = = ARCHIVE_RETRY ) ;
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
checkError ( err ) ;
return err = = ARCHIVE_OK ;
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
static struct archive * open ( const String & path_to_archive )
2023-05-29 17:10:03 +00:00
{
2023-08-01 13:48:49 +00:00
auto * archive = archive_read_new ( ) ;
archive_read_support_filter_all ( archive ) ;
archive_read_support_format_all ( archive ) ;
if ( archive_read_open_filename ( archive , path_to_archive . c_str ( ) , 10240 ) ! = ARCHIVE_OK )
throw Exception ( ErrorCodes : : CANNOT_UNPACK_ARCHIVE , " Couldn't open {} archive: {} " , ArchiveInfo : : name , quoteString ( path_to_archive ) ) ;
return archive ;
2023-05-29 17:10:03 +00:00
}
2023-08-01 13:48:49 +00:00
std : : vector < std : : string > getAllFiles ( )
2023-05-29 17:10:03 +00:00
{
2023-08-01 13:48:49 +00:00
auto * archive = open ( path_to_archive ) ;
auto * entry = archive_entry_new ( ) ;
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
std : : vector < std : : string > files ;
int error = archive_read_next_header ( archive , & entry ) ;
while ( error = = ARCHIVE_OK | | error = = ARCHIVE_RETRY )
{
files . push_back ( archive_entry_pathname ( entry ) ) ;
error = archive_read_next_header ( archive , & entry ) ;
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
archive_read_close ( archive ) ;
archive_read_free ( archive ) ;
checkError ( error ) ;
return files ;
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
void checkError ( int error )
{
if ( error = = ARCHIVE_FATAL )
throw Exception ( ErrorCodes : : CANNOT_UNPACK_ARCHIVE , " Failed to read archive while fetching all files: {} " , archive_error_string ( current_archive ) ) ;
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
void resetFileInfo ( )
{
file_name . reset ( ) ;
file_info . reset ( ) ;
}
const String & getFileName ( ) const
{
if ( ! file_name )
file_name . emplace ( archive_entry_pathname ( current_entry ) ) ;
return * file_name ;
}
const FileInfo & getFileInfo ( ) const
{
if ( ! file_info )
2023-05-29 17:10:03 +00:00
{
2023-08-01 13:48:49 +00:00
file_info . emplace ( ) ;
file_info - > uncompressed_size = archive_entry_size ( current_entry ) ;
file_info - > compressed_size = archive_entry_size ( current_entry ) ;
file_info - > is_encrypted = false ;
2023-05-29 17:10:03 +00:00
}
2023-08-01 13:48:49 +00:00
return * file_info ;
}
struct archive * current_archive ;
struct archive_entry * current_entry ;
private :
const String path_to_archive ;
mutable std : : optional < String > file_name ;
mutable std : : optional < FileInfo > file_info ;
} ;
template < typename ArchiveInfo >
class LibArchiveReader < ArchiveInfo > : : FileEnumeratorImpl : public FileEnumerator
{
public :
explicit FileEnumeratorImpl ( Handle handle_ ) : handle ( std : : move ( handle_ ) ) { }
const String & getFileName ( ) const override { return handle . getFileName ( ) ; }
const FileInfo & getFileInfo ( ) const override { return handle . getFileInfo ( ) ; }
bool nextFile ( ) override { return handle . nextFile ( ) ; }
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
/// Releases owned handle to pass it to a read buffer.
Handle releaseHandle ( ) & & { return std : : move ( handle ) ; }
private :
Handle handle ;
} ;
template < typename ArchiveInfo >
class LibArchiveReader < ArchiveInfo > : : ReadBufferFromLibArchive : public ReadBufferFromFileBase
{
public :
explicit ReadBufferFromLibArchive ( Handle handle_ , std : : string path_to_archive_ )
: ReadBufferFromFileBase ( DBMS_DEFAULT_BUFFER_SIZE , nullptr , 0 )
, handle ( std : : move ( handle_ ) )
, path_to_archive ( std : : move ( path_to_archive_ ) )
{ }
off_t seek ( off_t /* off */ , int /* whence */ ) override
{
throw Exception ( ErrorCodes : : UNSUPPORTED_METHOD , " Seek is not supported when reading from archive " ) ;
2023-05-29 17:10:03 +00:00
}
2023-08-01 13:48:49 +00:00
off_t getPosition ( ) override
{
throw Exception ( ErrorCodes : : UNSUPPORTED_METHOD , " getPosition not supported when reading from archive " ) ;
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
String getFileName ( ) const override { return handle . getFileName ( ) ; }
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
Handle releaseHandle ( ) & &
{
return std : : move ( handle ) ;
}
2023-05-29 17:10:03 +00:00
private :
bool nextImpl ( ) override
{
2023-08-01 13:48:49 +00:00
auto bytes_read = archive_read_data ( handle . current_archive , internal_buffer . begin ( ) , static_cast < int > ( internal_buffer . size ( ) ) ) ;
2023-05-29 17:10:03 +00:00
2023-07-31 13:27:06 +00:00
if ( bytes_read < 0 )
2023-08-01 13:48:49 +00:00
throw Exception ( ErrorCodes : : CANNOT_READ_ALL_DATA , " Failed to read file {} from {}: {} " , handle . getFileName ( ) , path_to_archive , archive_error_string ( handle . current_archive ) ) ;
2023-07-31 13:27:06 +00:00
2023-05-29 17:10:03 +00:00
if ( ! bytes_read )
return false ;
2023-08-01 13:48:49 +00:00
total_bytes_read + = bytes ;
2023-05-29 17:10:03 +00:00
working_buffer = internal_buffer ;
working_buffer . resize ( bytes_read ) ;
return true ;
}
2023-07-31 13:27:06 +00:00
2023-05-29 17:10:03 +00:00
Handle handle ;
const String path_to_archive ;
2023-08-01 13:48:49 +00:00
size_t total_bytes_read = 0 ;
2023-05-29 17:10:03 +00:00
} ;
2023-07-31 08:50:09 +00:00
/// Remembers the archive path; the archive itself is opened lazily by each operation.
template <typename ArchiveInfo>
LibArchiveReader<ArchiveInfo>::LibArchiveReader(const String & path_to_archive_)
    : path_to_archive(path_to_archive_)
{
}
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
2023-08-01 13:48:49 +00:00
LibArchiveReader < ArchiveInfo > : : ~ LibArchiveReader ( ) = default ;
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
2023-08-01 13:48:49 +00:00
const std : : string & LibArchiveReader < ArchiveInfo > : : getPath ( ) const
{
return path_to_archive ;
}
2023-07-28 11:55:23 +00:00
2023-07-31 08:50:09 +00:00
/// Opens the archive in a fresh session and reports whether Handle::locateFile()
/// finds an entry named `filename`.
template <typename ArchiveInfo>
bool LibArchiveReader<ArchiveInfo>::fileExists(const String & filename)
{
    Handle lookup(path_to_archive);
    const bool found = lookup.locateFile(filename);
    return found;
}
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
LibArchiveReader < ArchiveInfo > : : FileInfo LibArchiveReader < ArchiveInfo > : : getFileInfo ( const String & filename )
2023-05-29 20:08:18 +00:00
{
Handle handle ( path_to_archive ) ;
handle . locateFile ( filename ) ;
2023-08-01 13:48:49 +00:00
return handle . getFileInfo ( ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
std : : unique_ptr < typename LibArchiveReader < ArchiveInfo > : : FileEnumerator > LibArchiveReader < ArchiveInfo > : : firstFile ( )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
Handle handle ( path_to_archive ) ;
if ( ! handle . nextFile ( ) )
return nullptr ;
return std : : make_unique < FileEnumeratorImpl > ( std : : move ( handle ) ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
std : : unique_ptr < ReadBufferFromFileBase > LibArchiveReader < ArchiveInfo > : : readFile ( const String & filename )
2023-05-29 20:08:18 +00:00
{
Handle handle ( path_to_archive ) ;
handle . locateFile ( filename ) ;
2023-08-01 13:48:49 +00:00
return std : : make_unique < ReadBufferFromLibArchive > ( std : : move ( handle ) , path_to_archive ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
template < typename ArchiveInfo >
2023-08-01 13:48:49 +00:00
std : : unique_ptr < ReadBufferFromFileBase > LibArchiveReader < ArchiveInfo > : : readFile ( std : : unique_ptr < FileEnumerator > enumerator )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
if ( ! dynamic_cast < FileEnumeratorImpl * > ( enumerator . get ( ) ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Wrong enumerator passed to readFile() " ) ;
auto enumerator_impl = std : : unique_ptr < FileEnumeratorImpl > ( static_cast < FileEnumeratorImpl * > ( enumerator . release ( ) ) ) ;
auto handle = std : : move ( * enumerator_impl ) . releaseHandle ( ) ;
return std : : make_unique < ReadBufferFromLibArchive > ( std : : move ( handle ) , path_to_archive ) ;
2023-05-29 20:08:18 +00:00
}
2023-05-29 17:10:03 +00:00
2023-08-01 13:48:49 +00:00
template < typename ArchiveInfo > std : : unique_ptr < typename LibArchiveReader < ArchiveInfo > : : FileEnumerator >
LibArchiveReader < ArchiveInfo > : : nextFile ( std : : unique_ptr < ReadBuffer > read_buffer )
2023-05-29 20:08:18 +00:00
{
2023-08-01 13:48:49 +00:00
if ( ! dynamic_cast < ReadBufferFromLibArchive * > ( read_buffer . get ( ) ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Wrong ReadBuffer passed to nextFile() " ) ;
auto read_buffer_from_libarchive = std : : unique_ptr < ReadBufferFromLibArchive > ( static_cast < ReadBufferFromLibArchive * > ( read_buffer . release ( ) ) ) ;
auto handle = std : : move ( * read_buffer_from_libarchive ) . releaseHandle ( ) ;
if ( ! handle . nextFile ( ) )
return nullptr ;
return std : : make_unique < FileEnumeratorImpl > ( std : : move ( handle ) ) ;
2023-05-29 20:08:18 +00:00
}
2023-08-01 13:48:49 +00:00
template < typename ArchiveInfo >
std : : vector < std : : string > LibArchiveReader < ArchiveInfo > : : getAllFiles ( )
{
Handle handle ( path_to_archive ) ;
return handle . getAllFiles ( ) ;
}
2023-05-29 20:08:18 +00:00
2023-07-31 08:50:09 +00:00
/// Passwords are not supported by this reader: the call always throws LOGICAL_ERROR.
template <typename ArchiveInfo>
void LibArchiveReader<ArchiveInfo>::setPassword(const String & /*password_*/)
{
    throw Exception(
        ErrorCodes::LOGICAL_ERROR,
        " Can not set password to {} archive ",
        ArchiveInfo::name);
}
2023-05-29 17:10:03 +00:00
2023-07-31 08:50:09 +00:00
/// Explicit instantiations for the archive kinds this file provides.
template class LibArchiveReader<TarArchiveInfo>;
template class LibArchiveReader<SevenZipArchiveInfo>;
2023-07-28 13:00:35 +00:00
# endif
2023-05-29 17:10:03 +00:00
}