1
0
Fork 0

Merging upstream version 0.10.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-17 21:11:12 +01:00
parent e076fdd679
commit 060c1457b6
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
21 changed files with 633 additions and 443 deletions

View file

@ -1,3 +1,10 @@
2019-01-31 Antonio Diaz Diaz <antonio@gnu.org>
* Version 0.10 released.
* Added new option '--bsolid'.
* Added new option '-B, --data-size'.
* create.cc: Set ustar name to zero if extended header is used.
2019-01-22 Antonio Diaz Diaz <antonio@gnu.org>
* Version 0.9 released.

View file

@ -8,7 +8,7 @@ LIBS = -llz -lpthread
SHELL = /bin/sh
CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1
objs = arg_parser.o lzip_index.o create.o extract.o list_lz.o main.o
objs = arg_parser.o lzip_index.o create.o extended.o extract.o list_lz.o main.o
.PHONY : all install install-bin install-info install-man \
@ -30,10 +30,11 @@ main.o : main.cc
$(objs) : Makefile
arg_parser.o : arg_parser.h
create.o : arg_parser.h lzip.h tarlz.h
extract.o : arg_parser.h lzip.h lzip_index.h tarlz.h
list_lz.o : arg_parser.h lzip.h lzip_index.h tarlz.h
lzip_index.o : lzip.h lzip_index.h
create.o : arg_parser.h tarlz.h
extended.o : tarlz.h
extract.o : arg_parser.h lzip_index.h tarlz.h
list_lz.o : arg_parser.h lzip_index.h tarlz.h
lzip_index.o : lzip_index.h tarlz.h
main.o : arg_parser.h tarlz.h
@ -123,6 +124,9 @@ dist : doc
$(DISTNAME)/testsuite/test_bad1.txt.tar \
$(DISTNAME)/testsuite/test_bad[12].txt \
$(DISTNAME)/testsuite/t155.tar \
$(DISTNAME)/testsuite/rfoo \
$(DISTNAME)/testsuite/rbar \
$(DISTNAME)/testsuite/rbaz \
$(DISTNAME)/testsuite/test3.tar \
$(DISTNAME)/testsuite/test3_bad[1-5].tar \
$(DISTNAME)/testsuite/test.txt.lz \

25
NEWS
View file

@ -1,16 +1,15 @@
Changes in version 0.9:
Changes in version 0.10:
Multi-threaded '-t, --list' has been implemented. See chapter 'Limitations
of parallel tar decoding' in the manual for details.
The new option '--bsolid', which selects per-data-block compression of the
archive, has been added. This option improves compression efficiency for
archives with lots of small files.
The new option '-n, --threads', which sets the number of decompression
threads, has been added.
The new option '-B, --data-size', which sets the size of the input data
blocks for '--bsolid', has been added.
Tarlz now recognizes global pax headers, but for now ignores them.
Tarlz now decodes numerical fields in headers using length-safe parsers
instead of strtoul to prevent the parser from exceeding the end of the field
if it does not contain a terminating character.
The new chapter 'Limitations of parallel tar decoding' has been added to the
manual.
If an extended header is required for any reason (for example a file size
larger than 8 GiB or a link name longer than 100 bytes), tarlz now moves the
filename also to the extended header to prevent an ustar tool from trying to
extract the file or link. This also makes it easier, during parallel
extraction or listing, to detect a tar member split between two lzip members
at the boundary between the extended header and the ustar header.

2
configure vendored
View file

@ -6,7 +6,7 @@
# to copy, distribute and modify it.
pkgname=tarlz
pkgversion=0.9
pkgversion=0.10
progname=tarlz
srctrigger=doc/${pkgname}.texi

138
create.cc
View file

@ -38,20 +38,21 @@
#include <lzlib.h>
#include "arg_parser.h"
#include "lzip.h"
#include "tarlz.h"
const CRC32C crc32c;
const CRC32 crc32c( true );
int cl_owner = -1; // global vars needed by add_member
int cl_group = -1;
int cl_data_size = 0;
Solidity solidity = no_solid;
namespace {
LZ_Encoder * encoder = 0; // local vars needed by add_member
const char * archive_namep = 0;
unsigned long long partial_data_size = 0; // current block size
int outfd = -1;
int gretval = 0;
@ -150,17 +151,18 @@ bool check_appendable( const int fd, const bool remove_eof )
}
class File_is_archive
class File_is_the_archive
{
dev_t archive_dev;
ino_t archive_ino;
bool initialized;
public:
File_is_archive() : initialized( false ) {}
bool init()
File_is_the_archive() : initialized( false ) {}
bool init( const int fd )
{
struct stat st;
if( fstat( outfd, &st ) != 0 ) return false;
if( fstat( fd, &st ) != 0 ) return false;
if( S_ISREG( st.st_mode ) )
{ archive_dev = st.st_dev; archive_ino = st.st_ino; initialized = true; }
return true;
@ -169,7 +171,7 @@ public:
{
return initialized && archive_dev == st.st_dev && archive_ino == st.st_ino;
}
} file_is_archive;
} file_is_the_archive;
bool archive_write( const uint8_t * const buf, const int size )
@ -223,50 +225,32 @@ void print_octal( uint8_t * const buf, int size, unsigned long long num )
while( --size >= 0 ) { buf[size] = '0' + ( num % 8 ); num /= 8; }
}
unsigned decimal_digits( unsigned long long value )
{
unsigned digits = 1;
while( value >= 10 ) { value /= 10; ++digits; }
return digits;
}
int record_size( const unsigned keyword_size, const unsigned long value_size )
{
// size = ' ' + keyword + '=' + value + '\n'
unsigned long long size = 1 + keyword_size + 1 + value_size + 1;
const unsigned d1 = decimal_digits( size );
size += decimal_digits( d1 + size );
if( size >= INT_MAX ) size = 0; // overflows snprintf size
return size;
}
bool write_extended( const Extended & extended )
{
const int path_rec = extended.path.size() ?
record_size( 4, extended.path.size() ) : 0;
const int lpath_rec = extended.linkpath.size() ?
record_size( 8, extended.linkpath.size() ) : 0;
const int size_rec = ( extended.size > 0 ) ?
record_size( 4, decimal_digits( extended.size ) ) : 0;
const unsigned long long edsize = path_rec + lpath_rec + size_rec + 22;
const unsigned long long bufsize = round_up( edsize );
const int path_rec = extended.recsize_path();
const int lpath_rec = extended.recsize_linkpath();
const int size_rec = extended.recsize_file_size();
const unsigned long long edsize = extended.edsize();
const unsigned long long bufsize = extended.edsize_pad();
if( edsize >= 1ULL << 33 ) return false; // too much extended data
if( bufsize == 0 ) return edsize == 0; // overflow or no extended data
char * const buf = new char[bufsize+1]; // extended records buffer
unsigned long long pos = path_rec; // goto can't cross this
unsigned long long pos = path_rec; // goto can't cross these
const unsigned crc_size = Extended::crc_record.size();
if( path_rec && snprintf( buf, path_rec + 1, "%d path=%s\n",
path_rec, extended.path.c_str() ) != path_rec )
path_rec, extended.path().c_str() ) != path_rec )
goto error;
if( lpath_rec && snprintf( buf + pos, lpath_rec + 1, "%d linkpath=%s\n",
lpath_rec, extended.linkpath.c_str() ) != lpath_rec )
lpath_rec, extended.linkpath().c_str() ) != lpath_rec )
goto error;
pos += lpath_rec;
if( size_rec && snprintf( buf + pos, size_rec + 1, "%d size=%llu\n",
size_rec, extended.size ) != size_rec )
size_rec, extended.file_size() ) != size_rec )
goto error;
pos += size_rec;
if( snprintf( buf + pos, 23, "22 GNU.crc32=00000000\n" ) != 22 ) goto error;
pos += 22;
std::memcpy( buf + pos, Extended::crc_record.c_str(), crc_size );
pos += crc_size;
if( pos != edsize ) goto error;
print_hex( buf + edsize - 9, 8,
crc32c.windowed_crc( (const uint8_t *)buf, edsize - 9, edsize ) );
@ -316,27 +300,29 @@ const char * remove_leading_dotdot( const char * const filename )
}
// Return true if filename fits in the ustar header.
// Return true if it stores filename in the ustar header.
bool store_name( const char * const filename, Extended & extended,
Tar_header header )
Tar_header header, const bool force_extended_name )
{
const char * const stored_name = remove_leading_dotdot( filename );
const int len = std::strlen( stored_name );
enum { max_len = prefix_l + 1 + name_l }; // prefix + '/' + name
// first try storing filename in the ustar header
if( len <= name_l ) // stored_name fits in name
{ std::memcpy( header + name_o, stored_name, len ); return true; }
if( len <= max_len ) // find shortest prefix
for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i )
if( stored_name[i] == '/' ) // stored_name can be split
{
std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 );
std::memcpy( header + prefix_o, stored_name, i );
return true;
}
if( !force_extended_name ) // try storing filename in the ustar header
{
const int len = std::strlen( stored_name );
enum { max_len = prefix_l + 1 + name_l }; // prefix + '/' + name
if( len <= name_l ) // stored_name fits in name
{ std::memcpy( header + name_o, stored_name, len ); return true; }
if( len <= max_len ) // find shortest prefix
for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i )
if( stored_name[i] == '/' ) // stored_name can be split
{
std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 );
std::memcpy( header + prefix_o, stored_name, i );
return true;
}
}
// store filename in extended record, leave name zeroed in ustar header
extended.path = stored_name;
extended.path( stored_name );
return false;
}
@ -348,13 +334,13 @@ int add_member( const char * const filename, const struct stat *,
if( lstat( filename, &st ) != 0 )
{ show_file_error( filename, "Can't stat input file", errno );
gretval = 1; return 0; }
if( file_is_archive( st ) )
if( file_is_the_archive( st ) )
{ show_file_error( archive_namep, "File is the archive; not dumped." );
return 0; }
Extended extended; // metadata for extended records
Tar_header header;
init_tar_header( header );
store_name( filename, extended, header );
bool force_extended_name = false;
const mode_t mode = st.st_mode;
print_octal( header + mode_o, mode_l - 1,
@ -392,7 +378,8 @@ int add_member( const char * const filename, const struct stat *,
{
char * const buf = new char[st.st_size+1];
len = readlink( filename, buf, st.st_size );
if( len == st.st_size ) { buf[len] = 0; extended.linkpath = buf; }
if( len == st.st_size )
{ buf[len] = 0; extended.linkpath( buf ); force_extended_name = true; }
delete[] buf;
}
if( len != st.st_size )
@ -418,12 +405,30 @@ int add_member( const char * const filename, const struct stat *,
const struct group * const gr = getgrgid( gid );
if( gr && gr->gr_name )
std::strncpy( (char *)header + gname_o, gr->gr_name, gname_l - 1 );
if( file_size >= 1ULL << 33 ) extended.size = file_size;
if( file_size >= 1ULL << 33 )
{ extended.file_size( file_size ); force_extended_name = true; }
else print_octal( header + size_o, size_l - 1, file_size );
store_name( filename, extended, header, force_extended_name );
print_octal( header + chksum_o, chksum_l - 1, ustar_chksum( header ) );
const int infd = file_size ? open_instream( filename ) : -1;
if( file_size && infd < 0 ) { gretval = 1; return 0; }
if( encoder && solidity == bsolid )
{
const unsigned long long member_size =
header_size + extended.full_size() + round_up( file_size );
const unsigned long long target_size = cl_data_size;
if( partial_data_size >= target_size ||
( partial_data_size >= min_data_size &&
partial_data_size + member_size / 2 > target_size ) )
{
partial_data_size = member_size;
if( !archive_write( 0, 0 ) )
{ show_error( "Error flushing encoder", errno ); return 1; }
}
else partial_data_size += member_size;
}
if( !extended.empty() && !write_extended( extended ) )
{ show_error( "Error writing extended header", errno ); return 1; }
if( !archive_write( header, header_size ) )
@ -491,7 +496,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser,
{ show_error( "'--concatenate' is incompatible with '-f -'.", 0, true );
return 1; }
if( ( outfd = open_outstream( archive_name, false ) ) < 0 ) return 1;
if( !file_is_archive.init() )
if( !file_is_the_archive.init( outfd ) )
{ show_file_error( archive_name.c_str(), "Can't stat", errno ); return 1; }
int retval = 0;
@ -507,7 +512,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser,
{ show_file_error( filename, "Not an appendable tar.lz archive." );
close( infd ); retval = 2; break; }
struct stat st;
if( fstat( infd, &st ) == 0 && file_is_archive( st ) )
if( fstat( infd, &st ) == 0 && file_is_the_archive( st ) )
{ show_file_error( filename, "File is the archive; not concatenated." );
close( infd ); continue; }
if( !check_appendable( outfd, true ) )
@ -572,12 +577,18 @@ int encode( const std::string & archive_name, const Arg_parser & parser,
}
archive_namep = archive_name.size() ? archive_name.c_str() : "(stdout)";
if( !file_is_archive.init() )
if( !file_is_the_archive.init( outfd ) )
{ show_file_error( archive_namep, "Can't stat", errno ); return 1; }
if( compressed )
{
encoder = LZ_compress_open( option_mapping[level].dictionary_size,
const int dictionary_size = option_mapping[level].dictionary_size;
if( cl_data_size <= 0 )
{
if( level == 0 ) cl_data_size = 1 << 20;
else cl_data_size = 2 * dictionary_size;
}
encoder = LZ_compress_open( dictionary_size,
option_mapping[level].match_len_limit, LLONG_MAX );
if( !encoder || LZ_compress_errno( encoder ) != LZ_ok )
{
@ -619,7 +630,8 @@ int encode( const std::string & archive_name, const Arg_parser & parser,
enum { bufsize = 2 * header_size };
uint8_t buf[bufsize];
std::memset( buf, 0, bufsize );
if( encoder && solidity == asolid && !archive_write( 0, 0 ) )
if( encoder && ( solidity == asolid || solidity == bsolid ) &&
!archive_write( 0, 0 ) )
{ show_error( "Error flushing encoder", errno ); retval = 1; }
else if( !archive_write( buf, bufsize ) ||
( encoder && !archive_write( 0, 0 ) ) ) // flush encoder

View file

@ -1,5 +1,5 @@
.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1.
.TH TARLZ "1" "January 2019" "tarlz 0.9" "User Commands"
.TH TARLZ "1" "January 2019" "tarlz 0.10" "User Commands"
.SH NAME
tarlz \- creates tar archives with multimember lzip compression
.SH SYNOPSIS
@ -33,6 +33,9 @@ output version information and exit
\fB\-A\fR, \fB\-\-concatenate\fR
append tar.lz archives to the end of an archive
.TP
\fB\-B\fR, \fB\-\-data\-size=\fR<bytes>
set target size of input data blocks [2x8=16 MiB]
.TP
\fB\-c\fR, \fB\-\-create\fR
create a new archive
.TP
@ -66,6 +69,9 @@ set compression level [default 6]
\fB\-\-asolid\fR
create solidly compressed appendable archive
.TP
\fB\-\-bsolid\fR
create per\-data\-block compressed archive
.TP
\fB\-\-dsolid\fR
create per\-directory compressed archive
.TP

View file

@ -11,7 +11,7 @@ File: tarlz.info, Node: Top, Next: Introduction, Up: (dir)
Tarlz Manual
************
This manual is for Tarlz (version 0.9, 22 January 2019).
This manual is for Tarlz (version 0.10, 31 January 2019).
* Menu:
@ -120,6 +120,13 @@ archive 'foo'.
the archive if no FILES have been specified. Tarlz can't
concatenate uncompressed tar archives.
'-B BYTES'
'--data-size=BYTES'
Set target size of input data blocks for the '--bsolid' option.
Valid values range from 8 KiB to 1 GiB. Default value is two times
the dictionary size, except for option '-0' where it defaults to
1 MiB.
'-c'
'--create'
Create a new archive from FILES.
@ -190,6 +197,18 @@ archive 'foo'.
members it creates, reducing the amount of memory required for
decompression.
Level Dictionary size Match length limit
-0 64 KiB 16 bytes
-1 1 MiB 5 bytes
-2 1.5 MiB 6 bytes
-3 2 MiB 8 bytes
-4 3 MiB 12 bytes
-5 4 MiB 20 bytes
-6 8 MiB 36 bytes
-7 16 MiB 68 bytes
-8 24 MiB 132 bytes
-9 32 MiB 273 bytes
'--asolid'
When creating or appending to a compressed archive, use appendable
solid compression. All the files being added to the archive are
@ -197,6 +216,15 @@ archive 'foo'.
are compressed into a separate lzip member. This creates a solidly
compressed appendable archive.
'--bsolid'
When creating or appending to a compressed archive, compress tar
members together in a lzip member until they approximate a target
uncompressed size. The size can't be exact because each solidly
compressed data block must contain an integer number of tar
members. This option improves compression efficiency for archives
with lots of small files. *Note --data-size::, to set the target
block size.
'--dsolid'
When creating or appending to a compressed archive, use solid
compression for each directory specified in the command line. The
@ -560,13 +588,13 @@ old tar programs from extracting the extended records as a file in the
wrong place. Tarlz also sets to zero those fields of the ustar header
overridden by extended records.
If the extended header is needed because of a file size larger than
8 GiB, the size field will be unable to contain the full size of the
file. Therefore the file may be partially extracted, and the tool will
issue a spurious warning about a corrupt header at the point where it
thinks the file ends. Setting to zero the overridden size in the ustar
header at least prevents the partial extraction and makes obvious that
the file has been truncated.
If an extended header is required for any reason (for example a file
size larger than 8 GiB or a link name longer than 100 bytes), tarlz
moves the filename also to the extended header to prevent an ustar tool
from trying to extract the file or link. This also makes it easier,
during parallel extraction or listing, to detect a tar member split
between two lzip members at the boundary between the extended header
and the ustar header.
4.3 As simple as possible (but not simpler)
@ -626,10 +654,10 @@ to single-threaded mode and continues decoding the archive. Currently
only the '--list' option is able to do multi-threaded decoding.
If the files in the archive are large, multi-threaded '--list' on a
regular tar.lz archive can be hundreds of times faster than sequential
'--list' because, in addition to using several processors, it only
needs to decompress part of each lzip member. See the following example
listing the Silesia corpus on a dual core machine:
regular (seekable) tar.lz archive can be hundreds of times faster than
sequential '--list' because, in addition to using several processors,
it only needs to decompress part of each lzip member. See the following
example listing the Silesia corpus on a dual core machine:
tarlz -9 -cf silesia.tar.lz silesia
time lzip -cd silesia.tar.lz | tar -tf - (5.032s)
@ -690,9 +718,9 @@ Example 7: Extract files 'a' and 'c' from archive 'archive.tar.lz'.
Example 8: Copy the contents of directory 'sourcedir' to the directory
'targetdir'.
'destdir'.
tarlz -C sourcedir -c . | tarlz -C targetdir -x
tarlz -C sourcedir -c . | tarlz -C destdir -x

File: tarlz.info, Node: Problems, Next: Concept index, Prev: Examples, Up: Top
@ -734,17 +762,18 @@ Concept index

Tag Table:
Node: Top223
Node: Introduction1012
Node: Invoking tarlz3124
Node: File format10384
Ref: key_crc3215169
Node: Amendments to pax format20586
Ref: crc3221110
Ref: flawed-compat22135
Node: Multi-threaded tar24508
Node: Examples27012
Node: Problems28682
Node: Concept index29208
Node: Introduction1013
Node: Invoking tarlz3125
Ref: --data-size4717
Node: File format11536
Ref: key_crc3216321
Node: Amendments to pax format21738
Ref: crc3222262
Ref: flawed-compat23287
Node: Multi-threaded tar25649
Node: Examples28164
Node: Problems29830
Node: Concept index30356

End Tag Table

View file

@ -6,8 +6,8 @@
@finalout
@c %**end of header
@set UPDATED 22 January 2019
@set VERSION 0.9
@set UPDATED 31 January 2019
@set VERSION 0.10
@dircategory Data Compression
@direntry
@ -89,7 +89,7 @@ member) just like to an uncompressed tar archive.
It is a safe posix-style backup format. In case of corruption,
tarlz can extract all the undamaged members from the tar.lz
archive, skipping over the damaged members, just like the standard
(uncompressed) tar. Moreover, the option @code{--keep-damaged} can be
(uncompressed) tar. Moreover, the option @samp{--keep-damaged} can be
used to recover as much data as possible from each damaged member,
and lziprecover can be used to recover some of the damaged members.
@ -154,6 +154,13 @@ end-of-file blocks are removed as each new archive is concatenated. Exit
with status 0 without modifying the archive if no @var{files} have been
specified. Tarlz can't concatenate uncompressed tar archives.
@anchor{--data-size}
@item -B @var{bytes}
@itemx --data-size=@var{bytes}
Set target size of input data blocks for the @samp{--bsolid} option. Valid
values range from @w{8 KiB} to @w{1 GiB}. Default value is two times the
dictionary size, except for option @samp{-0} where it defaults to @w{1 MiB}.
@item -c
@itemx --create
Create a new archive from @var{files}.
@ -161,13 +168,13 @@ Create a new archive from @var{files}.
@item -C @var{dir}
@itemx --directory=@var{dir}
Change to directory @var{dir}. When creating or appending, the position
of each @code{-C} option in the command line is significant; it will
of each @samp{-C} option in the command line is significant; it will
change the current working directory for the following @var{files} until
a new @code{-C} option appears in the command line. When extracting, all
the @code{-C} options are executed in sequence before starting the
extraction. Listing ignores any @code{-C} options specified. @var{dir}
a new @samp{-C} option appears in the command line. When extracting, all
the @samp{-C} options are executed in sequence before starting the
extraction. Listing ignores any @samp{-C} options specified. @var{dir}
is relative to the then current working directory, perhaps changed by a
previous @code{-C} option.
previous @samp{-C} option.
@item -f @var{archive}
@itemx --file=@var{archive}
@ -222,6 +229,20 @@ Set the compression level. The default compression level is @samp{-6}.
Like lzip, tarlz also minimizes the dictionary size of the lzip members
it creates, reducing the amount of memory required for decompression.
@multitable {Level} {Dictionary size} {Match length limit}
@item Level @tab Dictionary size @tab Match length limit
@item -0 @tab 64 KiB @tab 16 bytes
@item -1 @tab 1 MiB @tab 5 bytes
@item -2 @tab 1.5 MiB @tab 6 bytes
@item -3 @tab 2 MiB @tab 8 bytes
@item -4 @tab 3 MiB @tab 12 bytes
@item -5 @tab 4 MiB @tab 20 bytes
@item -6 @tab 8 MiB @tab 36 bytes
@item -7 @tab 16 MiB @tab 68 bytes
@item -8 @tab 24 MiB @tab 132 bytes
@item -9 @tab 32 MiB @tab 273 bytes
@end multitable
@item --asolid
When creating or appending to a compressed archive, use appendable solid
compression. All the files being added to the archive are compressed
@ -229,6 +250,14 @@ into a single lzip member, but the end-of-file blocks are compressed
into a separate lzip member. This creates a solidly compressed
appendable archive.
@item --bsolid
When creating or appending to a compressed archive, compress tar members
together in a lzip member until they approximate a target uncompressed size.
The size can't be exact because each solidly compressed data block must
contain an integer number of tar members. This option improves compression
efficiency for archives with lots of small files. @xref{--data-size}, to set
the target block size.
@item --dsolid
When creating or appending to a compressed archive, use solid
compression for each directory specified in the command line. The
@ -252,7 +281,7 @@ resulting archive is not appendable. No more files can be later appended
to the archive.
@item --anonymous
Equivalent to @code{--owner=root --group=root}.
Equivalent to @samp{--owner=root --group=root}.
@item --owner=@var{owner}
When creating or appending, use @var{owner} for files added to the
@ -287,7 +316,7 @@ keyword appearing in the same block of extended records.
@end ignore
@item --uncompressed
With @code{--create}, don't compress the created tar archive. Create an
With @samp{--create}, don't compress the created tar archive. Create an
uncompressed tar archive instead.
@end table
@ -350,7 +379,7 @@ Zero or more blocks that contain the contents of the file.
@end itemize
Each tar member must be contiguously stored in a lzip member for the
parallel decoding operations like @code{--list} to work. If any tar member
parallel decoding operations like @samp{--list} to work. If any tar member
is split over two or more lzip members, the archive must be decoded
sequentially. @xref{Multi-threaded tar}.
@ -381,7 +410,7 @@ tar.lz
@end verbatim
@ignore
When @code{--permissive} is used, the following violations of the
When @samp{--permissive} is used, the following violations of the
archive format are allowed:@*
If several extended headers precede an ustar header, only the last
extended header takes effect. The other extended headers are ignored.
@ -623,13 +652,12 @@ programs from extracting the extended records as a file in the wrong place.
Tarlz also sets to zero those fields of the ustar header overridden by
extended records.
If the extended header is needed because of a file size larger than
@w{8 GiB}, the size field will be unable to contain the full size of the
file. Therefore the file may be partially extracted, and the tool will issue
a spurious warning about a corrupt header at the point where it thinks the
file ends. Setting to zero the overridden size in the ustar header at least
prevents the partial extraction and makes obvious that the file has been
truncated.
If an extended header is required for any reason (for example a file size
larger than @w{8 GiB} or a link name longer than 100 bytes), tarlz moves the
filename also to the extended header to prevent an ustar tool from trying to
extract the file or link. This also makes it easier, during parallel
extraction or listing, to detect a tar member split between two lzip members
at the boundary between the extended header and the ustar header.
@sp 1
@section As simple as possible (but not simpler)
@ -679,14 +707,14 @@ decoding it safely in parallel.
Tarlz is able to automatically decode aligned and unaligned multimember
tar.lz archives, keeping backwards compatibility. If tarlz finds a member
misalignment during multi-threaded decoding, it switches to single-threaded
mode and continues decoding the archive. Currently only the @code{--list}
mode and continues decoding the archive. Currently only the @samp{--list}
option is able to do multi-threaded decoding.
If the files in the archive are large, multi-threaded @code{--list} on a
regular tar.lz archive can be hundreds of times faster than sequential
@code{--list} because, in addition to using several processors, it only
needs to decompress part of each lzip member. See the following example
listing the Silesia corpus on a dual core machine:
If the files in the archive are large, multi-threaded @samp{--list} on a
regular (seekable) tar.lz archive can be hundreds of times faster than
sequential @samp{--list} because, in addition to using several processors,
it only needs to decompress part of each lzip member. See the following
example listing the Silesia corpus on a dual core machine:
@example
tarlz -9 -cf silesia.tar.lz silesia
@ -772,10 +800,10 @@ tarlz -xf archive.tar.lz a c
@sp 1
@noindent
Example 8: Copy the contents of directory @samp{sourcedir} to the
directory @samp{targetdir}.
directory @samp{destdir}.
@example
tarlz -C sourcedir -c . | tarlz -C targetdir -x
tarlz -C sourcedir -c . | tarlz -C destdir -x
@end example

156
extended.cc Normal file
View file

@ -0,0 +1,156 @@
/* Tarlz - Archiver with multimember lzip compression
Copyright (C) 2013-2019 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#define _FILE_OFFSET_BITS 64
#include <cctype>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <stdint.h>
#include "tarlz.h"
namespace {
// Return the number of decimal digits needed to print 'value'.
// The result is always at least 1 (a value of 0 prints as "0").
unsigned decimal_digits( unsigned long long value )
  {
  unsigned count = 0;
  do { ++count; value /= 10; } while( value != 0 );
  return count;
  }
// Return the total size in bytes of an extended record of the form
// "<size> <keyword>=<value>\n", where <size> is the decimal size of the
// whole record, including the digits of <size> itself.
// Return 0 if the total does not fit in the int size used with snprintf.
int record_size( const unsigned keyword_size, const unsigned long value_size )
  {
  // bytes following the size field: ' ' + keyword + '=' + value + '\n'
  const unsigned long long payload = 1 + keyword_size + 1 + value_size + 1;
  const unsigned payload_digits = decimal_digits( payload );
  // adding the digits of the size field may itself add one more digit
  const unsigned long long total =
    payload + decimal_digits( payload_digits + payload );
  return ( total < INT_MAX ) ? total : 0;	// 0 = overflows snprintf size
  }
/* Parse an unsigned decimal number from the first 'size' bytes of 'ptr'
   without reading past them. Leading whitespace is skipped.
   On success, return the value and set '*tailp' (if 'tailp' is not null)
   to point to the first byte after the last digit parsed.
   On error (no digit found inside the field, or value larger than
   LLONG_MAX), return 0 and set '*tailp' to 'ptr'. */
unsigned long long parse_decimal( const char * const ptr,
                                  const char ** const tailp,
                                  const unsigned long long size )
  {
  const unsigned long long limit = LLONG_MAX;
  unsigned long long result = 0;
  unsigned long long i = 0;
  // cast to unsigned char; passing a negative char to <cctype> is undefined
  while( i < size && std::isspace( (unsigned char)ptr[i] ) ) ++i;
  // test the bound first; ptr[size] is outside of the field
  if( i >= size || !std::isdigit( (unsigned char)ptr[i] ) )
    { if( tailp ) *tailp = ptr; return 0; }
  for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i )
    {
    const unsigned digit = ptr[i] - '0';
    // test before multiplying; a wrapped product may look like a valid value
    if( result > ( limit - digit ) / 10 )		// overflow
      { if( tailp ) *tailp = ptr; return 0; }
    result = result * 10 + digit;
    }
  if( tailp ) *tailp = ptr + i;
  return result;
  }
// Decode the 8 hexadecimal digits at 'ptr' (upper or lower case) into a
// 32-bit value, most significant digit first.
// Return 0 if any of the 8 characters is not a hexadecimal digit.
uint32_t parse_record_crc( const char * const ptr )
  {
  uint32_t value = 0;
  for( const char * p = ptr; p < ptr + 8; ++p )
    {
    const char ch = *p;
    unsigned nibble;
    if( ch >= '0' && ch <= '9' ) nibble = ch - '0';
    else if( ch >= 'A' && ch <= 'F' ) nibble = ch - 'A' + 10;
    else if( ch >= 'a' && ch <= 'f' ) nibble = ch - 'a' + 10;
    else return 0;			// invalid digit in crc string
    value = ( value << 4 ) | nibble;
    }
  return value;
  }
} // end namespace
const std::string Extended::crc_record( "22 GNU.crc32=00000000\n" );
// Return the size of the 'linkpath' extended record, or 0 if linkpath_ is
// empty. The size is computed on the first call (while the cached value is
// negative) and stored in recsize_linkpath_.
int Extended::recsize_linkpath() const
  {
  if( recsize_linkpath_ >= 0 ) return recsize_linkpath_;	// already computed
  if( linkpath_.size() == 0 ) recsize_linkpath_ = 0;
  else recsize_linkpath_ = record_size( 8, linkpath_.size() );	// "linkpath"
  return recsize_linkpath_;
  }
// Return the size of the 'path' extended record, or 0 if path_ is empty.
// The size is computed on the first call (while the cached value is
// negative) and stored in recsize_path_.
int Extended::recsize_path() const
  {
  if( recsize_path_ >= 0 ) return recsize_path_;	// already computed
  if( path_.size() == 0 ) recsize_path_ = 0;
  else recsize_path_ = record_size( 4, path_.size() );	// "path"
  return recsize_path_;
  }
int Extended::recsize_file_size() const
{
if( recsize_file_size_ < 0 ) recsize_file_size_ =
( file_size_ > 0 ) ? record_size( 4, file_size_ ) : 0;
return recsize_file_size_;
}
/* Parse the 'edsize' bytes of extended records stored in 'buf'.
   Each record has the form "<size> <keyword>=<value>\n", where <size> is
   the decimal size of the whole record.
   The values of the 'path', 'linkpath' and 'size' records are stored in
   this object, and the 'GNU.crc32' record is verified against the CRC
   computed over the records. Records with any other keyword are skipped.
   If 'permissive' is false, a repeated keyword is an error; else the last
   value replaces the previous one.
   Return true if all the records are well formed. Return false on a
   malformed record, a 'size' value that would fit in the ustar header
   (smaller than 1 << 33), or a CRC mismatch. */
bool Extended::parse( const char * const buf, const unsigned long long edsize,
                      const bool permissive )
  {
  reset();
  for( unsigned long long pos = 0; pos < edsize; )	// parse records
    {
    const char * tail;
    const unsigned long long rsize =
      parse_decimal( buf + pos, &tail, edsize - pos );
    // bytes taken by the size field, including any leading whitespace
    const unsigned long long lsize = tail - ( buf + pos );
    // The record must have room for the size field, the ' ' and the '\n'.
    // Else 'tail' would be outside of the record and 'rest' would wrap.
    if( rsize == 0 || rsize > edsize - pos || lsize + 2 > rsize ||
        tail[0] != ' ' || buf[pos+rsize-1] != '\n' ) return false;
    ++tail;	// point to keyword
    // rest = length of (keyword + '=' + value) without the final newline
    const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail;
    if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 )
      { if( path_.size() && !permissive ) return false;
        path_.assign( tail + 5, rest - 5 ); }
    else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
      { if( linkpath_.size() && !permissive ) return false;
        linkpath_.assign( tail + 9, rest - 9 ); }
    else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
      {
      if( file_size_ != 0 && !permissive ) return false;
      file_size_ = parse_decimal( tail + 5, &tail, rest - 5 );
      // parse error or size fits in ustar header
      if( file_size_ < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) )
        return false;
      }
    else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
      {
      if( crc_present_ && !permissive ) return false;
      // the 8 hex digits of the CRC must be inside of this record
      if( rsize != crc_record.size() || rest < 10 + 8 ) return false;
      const uint32_t stored_crc = parse_record_crc( tail + 10 );
      const uint32_t computed_crc =
        crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
      crc_present_ = true;
      if( stored_crc != computed_crc ) return false;
      }
    pos += rsize;
    }
  full_size_ = header_size + round_up( edsize );
  return true;
  }

View file

@ -37,7 +37,6 @@
#include <lzlib.h>
#include "arg_parser.h"
#include "lzip.h"
#include "lzip_index.h"
#include "tarlz.h"
@ -268,19 +267,19 @@ void format_member_name( const Extended & extended, const Tar_header header,
for( int i = 0; i < 2; ++i )
{
const int len = snprintf( rbuf() + offset, rbuf.size() - offset,
" %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
extended.size, 1900 + tm->tm_year, 1 + tm->tm_mon,
tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path.c_str(),
link_string, !islink ? "" : extended.linkpath.c_str() );
" %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
extended.file_size(), 1900 + tm->tm_year, 1 + tm->tm_mon,
tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path().c_str(),
link_string, !islink ? "" : extended.linkpath().c_str() );
if( (int)rbuf.size() > len + offset ) break;
else rbuf.resize( len + offset + 1 );
}
}
else
{
if( rbuf.size() < extended.path.size() + 2 )
rbuf.resize( extended.path.size() + 2 );
snprintf( rbuf(), rbuf.size(), "%s\n", extended.path.c_str() );
if( rbuf.size() < extended.path().size() + 2 )
rbuf.resize( extended.path().size() + 2 );
snprintf( rbuf(), rbuf.size(), "%s\n", extended.path().c_str() );
}
}
@ -303,8 +302,8 @@ int list_member( const int infd, const Extended & extended,
const unsigned bufsize = 32 * header_size;
uint8_t buf[bufsize];
unsigned long long rest = extended.size;
const int rem = extended.size % header_size;
unsigned long long rest = extended.file_size();
const int rem = rest % header_size;
const int padding = rem ? header_size - rem : 0;
while( rest > 0 )
{
@ -331,7 +330,7 @@ bool contains_dotdot( const char * const filename )
int extract_member( const int infd, const Extended & extended,
const Tar_header header, const bool keep_damaged )
{
const char * const filename = extended.path.c_str();
const char * const filename = extended.path().c_str();
if( contains_dotdot( filename ) )
{
show_file_error( filename, "Contains a '..' component, skipping." );
@ -357,7 +356,7 @@ int extract_member( const int infd, const Extended & extended,
case tf_link:
case tf_symlink:
{
const char * const linkname = extended.linkpath.c_str();
const char * const linkname = extended.linkpath().c_str();
/* if( contains_dotdot( linkname ) )
{
show_file_error( filename,
@ -421,8 +420,8 @@ int extract_member( const int infd, const Extended & extended,
const unsigned bufsize = 32 * header_size;
uint8_t buf[bufsize];
unsigned long long rest = extended.size;
const int rem = extended.size % header_size;
unsigned long long rest = extended.file_size();
const int rem = rest % header_size;
const int padding = rem ? header_size - rem : 0;
while( rest > 0 )
{
@ -501,42 +500,6 @@ bool compare_tslash( const char * const name1, const char * const name2 )
namespace {
/* Parse an unsigned decimal number from the first 'size' bytes at 'ptr'.
   Leading whitespace is skipped.

   On success returns the value and, if 'tailp' is not null, stores in
   '*tailp' a pointer to the first byte after the last digit.
   On failure (no digit within the first 'size' bytes, or a value larger
   than LLONG_MAX) returns 0 and stores 'ptr' in '*tailp'.
   Never reads ptr[size] or beyond. */
unsigned long long parse_decimal( const char * const ptr,
                                  const char ** const tailp,
                                  const unsigned long long size )
  {
  const unsigned long long limit = LLONG_MAX;
  unsigned long long result = 0;
  unsigned long long i = 0;
  // <cctype> functions require a value representable as unsigned char
  while( i < size && std::isspace( (unsigned char)ptr[i] ) ) ++i;
  if( i >= size || !std::isdigit( (unsigned char)ptr[i] ) )
    { if( tailp ) *tailp = ptr; return 0; }
  for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i )
    {
    const unsigned digit = ptr[i] - '0';
    // test before multiplying; a wrapped product can land back in range
    if( result > ( limit - digit ) / 10 )               // overflow
      { if( tailp ) *tailp = ptr; return 0; }
    result = result * 10 + digit;
    }
  if( tailp ) *tailp = ptr + i;
  return result;
  }
/* Decode the 8 hexadecimal digits (upper or lower case) at 'ptr' into a
   32-bit value, most significant digit first.
   Returns 0 if any of the 8 characters is not a hexadecimal digit. */
uint32_t parse_record_crc( const char * const ptr )
  {
  uint32_t value = 0;
  for( const char * p = ptr; p < ptr + 8; ++p )
    {
    const char ch = *p;
    unsigned nibble;
    if( ch >= '0' && ch <= '9' ) nibble = ch - '0';
    else if( ch >= 'A' && ch <= 'F' ) nibble = ch - 'A' + 10;
    else if( ch >= 'a' && ch <= 'f' ) nibble = ch - 'a' + 10;
    else return 0;                      // invalid digit in crc string
    value = ( value << 4 ) + nibble;
    }
  return value;
  }
bool parse_records( const int infd, Extended & extended,
const Tar_header header, const bool permissive )
{
@ -602,48 +565,6 @@ unsigned long long parse_octal( const uint8_t * const ptr, const int size )
}
/* Parse the 'edsize' bytes of extended records stored in 'buf' and load the
   recognized values into this object.

   Each record has the form "<length> <keyword>=<value>\n", where <length>
   is the decimal size of the whole record. The keywords 'path', 'linkpath',
   'size' and 'GNU.crc32' (each with a non-empty value) are processed; any
   other record is skipped.

   Unless 'permissive' is true, a keyword that was already seen is an error.
   Returns false if a record is malformed, if a 'size' value is below 2^33
   (it would fit in the ustar header), if the CRC record is not 22 bytes
   long, or if the stored CRC differs from the computed one. */
bool Extended::parse( const char * const buf, const unsigned long long edsize,
                      const bool permissive )
  {
  for( unsigned long long pos = 0; pos < edsize; )      // parse records
    {
    const char * tail;
    const unsigned long long rsize =
      parse_decimal( buf + pos, &tail, edsize - pos );
    // 'tail' may be at buf + edsize if the digits reach the end of the data;
    // verify it is inside the data before reading tail[0]
    if( rsize == 0 || rsize > edsize - pos || tail >= buf + edsize ||
        tail[0] != ' ' || buf[pos+rsize-1] != '\n' ) return false;
    const char * const rec_end = buf + ( pos + rsize - 1 );  // final newline
    ++tail;                                     // point to keyword
    // parse_decimal skips leading whitespace, so with input like "\n1 " the
    // record may end before the keyword starts. Reject it here, or the
    // unsigned length below would wrap around to a huge value.
    if( tail > rec_end ) return false;
    // rest = length of (keyword + '=' + value) without the final newline
    const unsigned long long rest = rec_end - tail;
    if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 )
      { if( path.size() && !permissive ) return false;
        path.assign( tail + 5, rest - 5 ); }
    else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
      { if( linkpath.size() && !permissive ) return false;
        linkpath.assign( tail + 9, rest - 9 ); }
    else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
      {
      if( size != 0 && !permissive ) return false;
      size = parse_decimal( tail + 5, &tail, rest - 5 );
      // parse error or size fits in ustar header
      if( size < 1ULL << 33 || tail != rec_end ) return false;
      }
    else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
      {
      if( crc_present && !permissive ) return false;
      if( rsize != 22 ) return false;
      const uint32_t stored_crc = parse_record_crc( tail + 10 );
      // pos + rsize - 9 is the offset of the 8 hex digits of the stored CRC
      const uint32_t computed_crc =
        crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
      crc_present = true;
      if( stored_crc != computed_crc ) return false;
      }
    pos += rsize;
    }
  return true;
  }
int decode( const std::string & archive_name, const Arg_parser & parser,
const int filenames, const int num_workers, const int debug_level,
const bool keep_damaged, const bool listing, const bool missing_crc,
@ -722,23 +643,27 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
if( !parse_records( infd, extended, header, permissive ) )
{ show_error( "Error in extended records. Skipping to next header." );
extended.reset(); gretval = 2; }
else if( !extended.crc_present && missing_crc )
else if( !extended.crc_present() && missing_crc )
{ show_error( "Missing CRC in extended records.", 0, true ); return 2; }
prev_extended = true;
continue;
}
prev_extended = false;
if( extended.linkpath.empty() ) // copy linkpath from ustar header
if( extended.linkpath().empty() ) // copy linkpath from ustar header
{
for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
extended.linkpath += header[linkname_o+i];
while( extended.linkpath.size() > 1 && // trailing '/'
extended.linkpath[extended.linkpath.size()-1] == '/' )
extended.linkpath.resize( extended.linkpath.size() - 1 );
int len = 0;
while( len < linkname_l && header[linkname_o+len] ) ++len;
while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/'
if( len > 0 )
{
const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0;
extended.linkpath( (const char *)header + linkname_o );
header[linkname_o+len] = c;
}
}
if( extended.path.empty() ) // copy path from ustar header
if( extended.path().empty() ) // copy path from ustar header
{
char stored_name[prefix_l+1+name_l+1];
int len = 0;
@ -749,9 +674,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
{ stored_name[len] = header[name_o+i]; ++len; }
while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/'
stored_name[len] = 0;
extended.path = remove_leading_slash( stored_name );
extended.path( remove_leading_slash( stored_name ) );
}
const char * const filename = extended.path.c_str();
const char * const filename = extended.path().c_str();
bool skip = filenames > 0;
if( skip )
@ -765,9 +690,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
{ skip = false; name_pending[i] = false; break; }
}
if( extended.size == 0 &&
if( extended.file_size() == 0 &&
( typeflag == tf_regular || typeflag == tf_hiperf ) )
extended.size = parse_octal( header + size_o, size_l );
extended.file_size( parse_octal( header + size_o, size_l ) );
if( listing || skip )
retval = list_member( infd, extended, header, skip );

View file

@ -32,7 +32,6 @@
#include <lzlib.h>
#include "arg_parser.h"
#include "lzip.h"
#include "lzip_index.h"
#include "tarlz.h"
@ -355,8 +354,8 @@ int list_member_lz( LZ_Decoder * const decoder, const int infd,
Resizable_buffer & rbuf, const long member_id,
const int worker_id, const char ** msg, const bool skip )
{
unsigned long long rest = extended.size;
const int rem = extended.size % header_size;
unsigned long long rest = extended.file_size();
const int rem = rest % header_size;
const int padding = rem ? header_size - rem : 0;
const long long data_rest = mdata_end - ( data_pos + rest + padding );
bool master = false;
@ -527,7 +526,7 @@ extern "C" void * dworker_l( void * arg )
ret = 2; }
else ret = parse_records_lz( decoder, infd, file_pos, member_end,
cdata_size, data_pos, extended, header, &msg, permissive );
if( ret == 0 && !extended.crc_present && missing_crc )
if( ret == 0 && !extended.crc_present() && missing_crc )
{ msg = "Missing CRC in extended records."; ret = 2; }
if( ret != 0 )
{
@ -549,16 +548,20 @@ extern "C" void * dworker_l( void * arg )
}
prev_extended = false;
if( extended.linkpath.empty() ) // copy linkpath from ustar header
if( extended.linkpath().empty() ) // copy linkpath from ustar header
{
for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
extended.linkpath += header[linkname_o+i];
while( extended.linkpath.size() > 1 && // trailing '/'
extended.linkpath[extended.linkpath.size()-1] == '/' )
extended.linkpath.resize( extended.linkpath.size() - 1 );
int len = 0;
while( len < linkname_l && header[linkname_o+len] ) ++len;
while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/'
if( len > 0 )
{
const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0;
extended.linkpath( (const char *)header + linkname_o );
header[linkname_o+len] = c;
}
}
if( extended.path.empty() ) // copy path from ustar header
if( extended.path().empty() ) // copy path from ustar header
{
char stored_name[prefix_l+1+name_l+1];
int len = 0;
@ -569,9 +572,9 @@ extern "C" void * dworker_l( void * arg )
{ stored_name[len] = header[name_o+i]; ++len; }
while( len > 0 && stored_name[len-1] == '/' ) --len; // trailing '/'
stored_name[len] = 0;
extended.path = remove_leading_slash( stored_name );
extended.path( remove_leading_slash( stored_name ) );
}
const char * const filename = extended.path.c_str();
const char * const filename = extended.path().c_str();
bool skip = filenames > 0;
if( skip )
@ -585,9 +588,9 @@ extern "C" void * dworker_l( void * arg )
{ skip = false; name_pending[i] = false; break; }
}
if( extended.size == 0 &&
if( extended.file_size() == 0 &&
( typeflag == tf_regular || typeflag == tf_hiperf ) )
extended.size = parse_octal( header + size_o, size_l );
extended.file_size( parse_octal( header + size_o, size_l ) );
retval = list_member_lz( decoder, infd, file_pos, member_end,
cdata_size, data_pos, mdata_end, courier,
@ -643,7 +646,7 @@ int list_lz( const Arg_parser & parser, std::vector< char > & name_pending,
const int debug_level, const int infd, const int num_workers,
const bool missing_crc, const bool permissive )
{
const int out_slots = 100;
const int out_slots = 65536; // max small files (<=512B) in 64 MiB
Packet_courier courier( num_workers, out_slots );
Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers];

146
lzip.h
View file

@ -1,146 +0,0 @@
/* Tarlz - Archiver with multimember lzip compression
Copyright (C) 2013-2019 Antonio Diaz Diaz.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef LZ_API_VERSION
#define LZ_API_VERSION 1
#endif
// Limits of the lzip format used by the checks in this header.
enum {
min_dictionary_bits = 12,			// log2 of min_dictionary_size
min_dictionary_size = 1 << min_dictionary_bits,	// lower bound in isvalid_ds
max_dictionary_bits = 29,			// log2 of max_dictionary_size
max_dictionary_size = 1 << max_dictionary_bits,	// upper bound in isvalid_ds
// smallest member size accepted by Lzip_trailer::verify_consistency
min_member_size = 36 };
// Table-driven CRC32 using the reflected polynomial 0xEDB88320.
class CRC32
  {
  uint32_t data[256];           // Table of CRCs of all 8-bit messages.

public:
  // Build the lookup table, one entry per possible byte value.
  CRC32()
    {
    const unsigned poly = 0xEDB88320U;
    for( unsigned byte = 0; byte < 256; ++byte )
      {
      unsigned rem = byte;
      for( int bit = 8; bit > 0; --bit )
        rem = ( rem & 1 ) ? ( poly ^ ( rem >> 1 ) ) : ( rem >> 1 );
      data[byte] = rem;
      }
    }

  // Fold one more byte into the running value 'crc' (updated in place).
  void update_byte( uint32_t & crc, const uint8_t byte ) const
    {
    const unsigned index = ( crc ^ byte ) & 0xFF;
    crc = ( crc >> 8 ) ^ data[index];
    }
  };
// Return true if 'dictionary_size' lies within the inclusive range
// [min_dictionary_size, max_dictionary_size].
inline bool isvalid_ds( const unsigned dictionary_size )
  {
  if( dictionary_size < min_dictionary_size ) return false;
  return ( dictionary_size <= max_dictionary_size );
  }
const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP"
// The 6-byte header of a lzip member.
struct Lzip_header
  {
  uint8_t data[6];                      // 0-3 magic bytes
                                        //   4 version
                                        //   5 coded_dict_size
  enum { size = 6 };

  // True if the 4 magic bytes are exactly "LZIP".
  bool verify_magic() const
    { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }

  // True if the first 'sz' bytes (at most 4 are compared) match the start
  // of the magic string. Returns false if sz <= 0.
  bool verify_prefix( const int sz ) const      // detect (truncated) header
    {
    for( int i = 0; i < sz && i < 4; ++i )
      if( data[i] != lzip_magic[i] ) return false;
    return ( sz > 0 );
    }

  // True if 2 or 3 (but not all 4) of the magic bytes match.
  bool verify_corrupt() const                   // detect corrupt header
    {
    int matches = 0;
    for( int i = 0; i < 4; ++i )
      if( data[i] == lzip_magic[i] ) ++matches;
    return ( matches > 1 && matches < 4 );
    }

  uint8_t version() const { return data[4]; }
  bool verify_version() const { return ( data[4] == 1 ); }

  // Decode byte 5. The low 5 bits are the base-2 logarithm of the base size.
  // The high 3 bits are the number of sixteenths of the base size to
  // subtract, applied only when the base size exceeds min_dictionary_size.
  unsigned dictionary_size() const
    {
    // Shift an unsigned 1: the exponent can be as large as 31, which does
    // not fit in a 32-bit signed int.
    unsigned sz = ( 1U << ( data[5] & 0x1F ) );
    if( sz > min_dictionary_size )
      sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
    return sz;
    }
  };
// The 20-byte trailer of a lzip member. Every field is decoded with the
// byte at the lowest offset as the least significant one.
struct Lzip_trailer
  {
  uint8_t data[20];     //  0-3  CRC32 of the uncompressed data
                        //  4-11 size of the uncompressed data
                        // 12-19 member size including header and trailer
  enum { size = 20 };

  unsigned data_crc() const
    {
    unsigned crc = 0;
    for( int i = 0; i < 4; ++i ) crc += (unsigned)data[i] << ( 8 * i );
    return crc;
    }

  unsigned long long data_size() const
    {
    unsigned long long value = 0;
    for( int i = 0; i < 8; ++i )
      value += (unsigned long long)data[4+i] << ( 8 * i );
    return value;
    }

  unsigned long long member_size() const
    {
    unsigned long long value = 0;
    for( int i = 0; i < 8; ++i )
      value += (unsigned long long)data[12+i] << ( 8 * i );
    return value;
    }

  // Return false if the three fields can't belong to the same member:
  // the CRC and the data size must be both zero or both nonzero, the member
  // size must reach min_member_size, and the ratio between member size and
  // data size must stay within the limits computed below. A limit is only
  // applied when its computation did not wrap around.
  bool verify_consistency() const       // check internal consistency
    {
    const unsigned long long dsize = data_size();
    const unsigned long long msize = member_size();
    const bool zero_crc = ( data_crc() == 0 );
    const bool zero_size = ( dsize == 0 );
    if( zero_crc != zero_size ) return false;
    if( msize < min_member_size ) return false;
    const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
    const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
    const bool member_too_big = ( mlimit > dsize && msize > mlimit );
    const bool data_too_big = ( dlimit > msize && dsize > dlimit );
    return !member_too_big && !data_too_big;
    }
  };
const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
const char * const bad_dict_msg = "Invalid dictionary size in member header.";
const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
const char * const trailing_msg = "Trailing data not allowed.";
// defined in extract.cc
int readblock( const int fd, uint8_t * const buf, const int size );
int writeblock( const int fd, const uint8_t * const buf, const int size );

View file

@ -26,8 +26,8 @@
#include <stdint.h>
#include <unistd.h>
#include "lzip.h"
#include "lzip_index.h"
#include "tarlz.h"
namespace {

11
main.cc
View file

@ -87,6 +87,7 @@ void show_help( const long num_online )
" -h, --help display this help and exit\n"
" -V, --version output version information and exit\n"
" -A, --concatenate append tar.lz archives to the end of an archive\n"
" -B, --data-size=<bytes> set target size of input data blocks [2x8=16 MiB]\n"
" -c, --create create a new archive\n"
" -C, --directory=<dir> change to directory <dir>\n"
" -f, --file=<archive> use archive file <archive>\n"
@ -98,6 +99,7 @@ void show_help( const long num_online )
" -x, --extract extract files from an archive\n"
" -0 .. -9 set compression level [default 6]\n"
" --asolid create solidly compressed appendable archive\n"
" --bsolid create per-data-block compressed archive\n"
" --dsolid create per-directory compressed archive\n"
" --no-solid create per-file compressed archive (default)\n"
" --solid create solidly compressed archive\n"
@ -284,8 +286,8 @@ int main( const int argc, const char * const argv[] )
{ show_error( "Bad library version. At least lzlib 1.0 is required." );
return 1; }
enum { opt_ano = 256, opt_aso, opt_crc, opt_dbg, opt_dso, opt_grp, opt_kd,
opt_nso, opt_own, opt_per, opt_sol, opt_un };
enum { opt_ano = 256, opt_aso, opt_bso, opt_crc, opt_dbg, opt_dso, opt_grp,
opt_kd, opt_nso, opt_own, opt_per, opt_sol, opt_un };
const Arg_parser::Option options[] =
{
{ '0', 0, Arg_parser::no },
@ -299,6 +301,7 @@ int main( const int argc, const char * const argv[] )
{ '8', 0, Arg_parser::no },
{ '9', 0, Arg_parser::no },
{ 'A', "concatenate", Arg_parser::no },
{ 'B', "data-size", Arg_parser::yes },
{ 'c', "create", Arg_parser::no },
{ 'C', "directory", Arg_parser::yes },
{ 'f', "file", Arg_parser::yes },
@ -313,6 +316,7 @@ int main( const int argc, const char * const argv[] )
{ 'x', "extract", Arg_parser::no },
{ opt_ano, "anonymous", Arg_parser::no },
{ opt_aso, "asolid", Arg_parser::no },
{ opt_bso, "bsolid", Arg_parser::no },
{ opt_dbg, "debug", Arg_parser::yes },
{ opt_dso, "dsolid", Arg_parser::no },
{ opt_grp, "group", Arg_parser::yes },
@ -347,6 +351,8 @@ int main( const int argc, const char * const argv[] )
case '5': case '6': case '7': case '8': case '9':
level = code - '0'; break;
case 'A': set_mode( program_mode, m_concatenate ); break;
case 'B': cl_data_size = getnum( arg, min_data_size, max_data_size );
break;
case 'c': set_mode( program_mode, m_create ); break;
case 'C': break; // skip chdir
case 'f': if( sarg != "-" ) archive_name = sarg; break;
@ -361,6 +367,7 @@ int main( const int argc, const char * const argv[] )
case 'x': set_mode( program_mode, m_extract ); break;
case opt_ano: set_owner( "root" ); set_group( "root" ); break;
case opt_aso: solidity = asolid; break;
case opt_bso: solidity = bsolid; break;
case opt_crc: missing_crc = true; break;
case opt_dbg: debug_level = getnum( arg, 0, 3 ); break;
case opt_dso: solidity = dsolid; break;

216
tarlz.h
View file

@ -42,22 +42,195 @@ inline bool verify_ustar_magic( const uint8_t * const header )
{ return std::memcmp( header + magic_o, ustar_magic, magic_l ) == 0; }
class CRC32C // Uses CRC32-C (Castagnoli) polynomial.
// Round "size" to the next multiple of header size (512).
//
inline unsigned long long round_up( const unsigned long long size )
  {
  const int excess = size % header_size;        // bytes past the last boundary
  if( excess == 0 ) return size;                // already a multiple
  return size + ( header_size - excess );
  }
// Stores metadata from/for extended records.
// The record sizes and the full size are cached in mutable members; a
// negative cached value means "not computed yet". Every setter invalidates
// the cached values that depend on the member it changes.
class Extended
  {
  std::string linkpath_;
  std::string path_;
  unsigned long long file_size_;

  mutable long long full_size_;         // cached sizes
  mutable int recsize_linkpath_;
  mutable int recsize_path_;
  mutable int recsize_file_size_;

  bool crc_present_;            // true if CRC present in parsed records

public:
  static const std::string crc_record;

  Extended()
    : file_size_( 0 ), full_size_( -1 ), recsize_linkpath_( -1 ),
      recsize_path_( -1 ), recsize_file_size_( -1 ), crc_present_( false ) {}

  // Return the object to its freshly constructed state.
  void reset()
    {
    linkpath_.clear();
    path_.clear();
    file_size_ = 0;
    crc_present_ = false;
    full_size_ = -1;
    recsize_linkpath_ = -1;
    recsize_path_ = -1;
    recsize_file_size_ = -1;
    }

  // True if there is neither linkpath, nor path, nor file size stored.
  bool empty() const
    {
    if( file_size_ != 0 ) return false;
    return linkpath_.empty() && path_.empty();
    }

  // getters
  const std::string & linkpath() const { return linkpath_; }
  const std::string & path() const { return path_; }
  unsigned long long file_size() const { return file_size_; }

  // setters
  void linkpath( const char * const lp )
    {
    linkpath_ = lp;
    recsize_linkpath_ = -1;
    full_size_ = -1;
    }

  void path( const char * const p )
    {
    path_ = p;
    recsize_path_ = -1;
    full_size_ = -1;
    }

  void file_size( const unsigned long long fs )
    {
    file_size_ = fs;
    recsize_file_size_ = -1;
    full_size_ = -1;
    }

  int recsize_linkpath() const;
  int recsize_path() const;
  int recsize_file_size() const;

  // Extended data size: sum of the sizes of all the records including the
  // CRC record, or 0 if there is nothing to store.
  unsigned long long edsize() const
    {
    if( empty() ) return 0;
    return recsize_linkpath() + recsize_path() + recsize_file_size() +
           crc_record.size();
    }

  unsigned long long edsize_pad() const         // edsize rounded up
    { return round_up( edsize() ); }

  // Size of the header plus the padded extended data, or 0 if empty.
  unsigned long long full_size() const
    {
    if( full_size_ < 0 )
      {
      if( empty() ) full_size_ = 0;
      else full_size_ = header_size + edsize_pad();
      }
    return full_size_;
    }

  bool crc_present() const { return crc_present_; }

  bool parse( const char * const buf, const unsigned long long edsize,
              const bool permissive );
  };
// Limits of the lzip format plus the limits of the data block size.
enum {
min_dictionary_bits = 12,			// log2 of min_dictionary_size
min_dictionary_size = 1 << min_dictionary_bits,	// lower bound in isvalid_ds
max_dictionary_bits = 29,			// log2 of max_dictionary_size
max_dictionary_size = 1 << max_dictionary_bits,	// upper bound in isvalid_ds
// smallest member size accepted by Lzip_trailer::verify_consistency
min_member_size = 36,
// range accepted for the argument of option '-B, --data-size'
min_data_size = 2 * min_dictionary_size,
max_data_size = 2 * max_dictionary_size };
// Return true if 'dictionary_size' lies within the inclusive range
// [min_dictionary_size, max_dictionary_size].
inline bool isvalid_ds( const unsigned dictionary_size )
  {
  if( dictionary_size < min_dictionary_size ) return false;
  return ( dictionary_size <= max_dictionary_size );
  }
const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; // "LZIP"
// The 6-byte header of a lzip member.
struct Lzip_header
  {
  uint8_t data[6];                      // 0-3 magic bytes
                                        //   4 version
                                        //   5 coded_dict_size
  enum { size = 6 };

  // True if the 4 magic bytes are exactly "LZIP".
  bool verify_magic() const
    { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }

  // True if the first 'sz' bytes (at most 4 are compared) match the start
  // of the magic string. Returns false if sz <= 0.
  bool verify_prefix( const int sz ) const      // detect (truncated) header
    {
    for( int i = 0; i < sz && i < 4; ++i )
      if( data[i] != lzip_magic[i] ) return false;
    return ( sz > 0 );
    }

  // True if 2 or 3 (but not all 4) of the magic bytes match.
  bool verify_corrupt() const                   // detect corrupt header
    {
    int matches = 0;
    for( int i = 0; i < 4; ++i )
      if( data[i] == lzip_magic[i] ) ++matches;
    return ( matches > 1 && matches < 4 );
    }

  uint8_t version() const { return data[4]; }
  bool verify_version() const { return ( data[4] == 1 ); }

  // Decode byte 5. The low 5 bits are the base-2 logarithm of the base size.
  // The high 3 bits are the number of sixteenths of the base size to
  // subtract, applied only when the base size exceeds min_dictionary_size.
  unsigned dictionary_size() const
    {
    // Shift an unsigned 1: the exponent can be as large as 31, which does
    // not fit in a 32-bit signed int.
    unsigned sz = ( 1U << ( data[5] & 0x1F ) );
    if( sz > min_dictionary_size )
      sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
    return sz;
    }
  };
// The 20-byte trailer of a lzip member. Every field is decoded with the
// byte at the lowest offset as the least significant one.
struct Lzip_trailer
  {
  uint8_t data[20];     //  0-3  CRC32 of the uncompressed data
                        //  4-11 size of the uncompressed data
                        // 12-19 member size including header and trailer
  enum { size = 20 };

  unsigned data_crc() const
    {
    unsigned crc = 0;
    for( int i = 0; i < 4; ++i ) crc += (unsigned)data[i] << ( 8 * i );
    return crc;
    }

  unsigned long long data_size() const
    {
    unsigned long long value = 0;
    for( int i = 0; i < 8; ++i )
      value += (unsigned long long)data[4+i] << ( 8 * i );
    return value;
    }

  unsigned long long member_size() const
    {
    unsigned long long value = 0;
    for( int i = 0; i < 8; ++i )
      value += (unsigned long long)data[12+i] << ( 8 * i );
    return value;
    }

  // Return false if the three fields can't belong to the same member:
  // the CRC and the data size must be both zero or both nonzero, the member
  // size must reach min_member_size, and the ratio between member size and
  // data size must stay within the limits computed below. A limit is only
  // applied when its computation did not wrap around.
  bool verify_consistency() const       // check internal consistency
    {
    const unsigned long long dsize = data_size();
    const unsigned long long msize = member_size();
    const bool zero_crc = ( data_crc() == 0 );
    const bool zero_size = ( dsize == 0 );
    if( zero_crc != zero_size ) return false;
    if( msize < min_member_size ) return false;
    const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
    const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
    const bool member_too_big = ( mlimit > dsize && msize > mlimit );
    const bool data_too_big = ( dlimit > msize && dsize > dlimit );
    return !member_too_big && !data_too_big;
    }
  };
class CRC32
{
uint32_t data[256]; // Table of CRCs of all 8-bit messages.
public:
CRC32C()
CRC32( const bool castagnoli = false )
{
const unsigned cpol = 0x82F63B78U; // CRC32-C Castagnoli polynomial.
const unsigned ipol = 0xEDB88320U; // IEEE 802.3 Ethernet polynomial.
const unsigned poly = castagnoli ? cpol : ipol;
for( unsigned n = 0; n < 256; ++n )
{
unsigned c = n;
for( int k = 0; k < 8; ++k )
{ if( c & 1 ) c = 0x82F63B78U ^ ( c >> 1 ); else c >>= 1; }
{ if( c & 1 ) c = poly ^ ( c >> 1 ); else c >>= 1; }
data[n] = c;
}
}
void update_byte( uint32_t & crc, const uint8_t byte ) const
{ crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); }
void update_buf( uint32_t & crc, const uint8_t * const buffer,
const int size ) const
{
@ -78,32 +251,7 @@ public:
}
};
extern const CRC32C crc32c;
// Round "size" to the next multiple of header size (512).
//
inline unsigned long long round_up( unsigned long long size )
  {
  const int excess = size % header_size;        // bytes past the last boundary
  if( excess == 0 ) return size;                // already a multiple
  return size + ( header_size - excess );
  }
// Stores metadata from/for extended records.
struct Extended
  {
  std::string linkpath;
  std::string path;
  unsigned long long size;
  bool crc_present;

  Extended() : size( 0 ), crc_present( false ) {}

  // Return the object to its freshly constructed state.
  void reset()
    {
    size = 0;
    crc_present = false;
    path.clear();
    linkpath.clear();
    }

  // True if there is neither linkpath, nor path, nor size stored.
  bool empty()
    {
    if( size != 0 ) return false;
    return linkpath.empty() && path.empty();
    }

  bool parse( const char * const buf, const unsigned long long edsize,
              const bool permissive );
  };
extern const CRC32 crc32c;
enum { initial_line_length = 1000 }; // must be >= 77
@ -132,10 +280,16 @@ public:
unsigned size() const { return size_; }
};
const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
const char * const bad_dict_msg = "Invalid dictionary size in member header.";
const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
const char * const trailing_msg = "Trailing data not allowed.";
// defined in create.cc
enum Solidity { no_solid, dsolid, asolid, solid };
enum Solidity { no_solid, bsolid, dsolid, asolid, solid };
extern int cl_owner;
extern int cl_group;
extern int cl_data_size;
extern Solidity solidity;
unsigned ustar_chksum( const uint8_t * const header );
bool verify_ustar_chksum( const uint8_t * const header );
@ -152,6 +306,8 @@ void format_member_name( const Extended & extended, const Tar_header header,
const char * remove_leading_slash( const char * const filename );
bool compare_prefix_dir( const char * const dir, const char * const name );
bool compare_tslash( const char * const name1, const char * const name2 );
int readblock( const int fd, uint8_t * const buf, const int size );
int writeblock( const int fd, const uint8_t * const buf, const int size );
unsigned long long parse_octal( const uint8_t * const ptr, const int size );
int decode( const std::string & archive_name, const Arg_parser & parser,
const int filenames, const int num_workers, const int debug_level,

View file

@ -65,7 +65,7 @@ lzlib_1_11() { [ ${lwarn} = 0 ] &&
# Description of test files for tarlz:
# test.txt.tar.lz: 1 member (test.txt).
# t155.tar[.lz]: directory + file + link + eof, all with 155 char names
# t155.tar[.lz]: directory + links + file + eof, all with 155 char names
# tar_in_tlz1.tar.lz 2 members (test.txt.tar test3.tar) 3 lzip members
# tar_in_tlz2.tar.lz 2 members (test.txt.tar test3.tar) 5 lzip members
# test_bad1.tar.lz: truncated at offset 6000 (of 7495)
@ -163,10 +163,11 @@ rm -f test.txt || framework_failure
"${TARLZ}" -xf "${in_tar}" --missing-crc || test_failed $LINENO
cmp "${in}" test.txt || test_failed $LINENO
rm -f test.txt || framework_failure
#
printf "foo\n" > cfoo || framework_failure
printf "bar\n" > cbar || framework_failure
printf "baz\n" > cbaz || framework_failure
# reference files for cmp
cat "${testdir}"/rfoo > cfoo || framework_failure
cat "${testdir}"/rbar > cbar || framework_failure
cat "${testdir}"/rbaz > cbaz || framework_failure
rm -f foo bar baz || framework_failure
"${TARLZ}" -xf "${test3_lz}" --missing-crc || test_failed $LINENO
cmp cfoo foo || test_failed $LINENO
@ -261,7 +262,7 @@ for i in "${tarint1_lz}" "${tarint2_lz}" ; do
cmp out0 out6 || test_failed $LINENO
cmp out2 out6 || test_failed $LINENO
cmp outv0 outv2 || test_failed $LINENO
cmp outv0 outv2 || test_failed $LINENO
cmp outv0 outv6 || test_failed $LINENO
cmp outv2 outv6 || test_failed $LINENO
rm -f out0 out2 out6 outv0 outv2 outv6 || framework_failure
"${TARLZ}" -xf "$i" || test_failed $LINENO
@ -409,14 +410,14 @@ cat cbar > bar || framework_failure
cat cbaz > baz || framework_failure
"${TARLZ}" --solid -0 -cf out.tar.lz foo || test_failed $LINENO
cat out.tar.lz > aout.tar.lz || framework_failure
for i in --asolid --dsolid --solid -0 ; do
for i in --asolid --bsolid --dsolid --solid -0 ; do
"${TARLZ}" $i -q -rf out.tar.lz bar baz
[ $? = 2 ] || test_failed $LINENO $i
cmp out.tar.lz aout.tar.lz || test_failed $LINENO $i
done
rm -f out.tar.lz aout.tar.lz || framework_failure
for i in --asolid --dsolid -0 ; do
for j in --asolid --dsolid --solid -0 ; do
for i in --asolid --bsolid --dsolid -0 ; do
for j in --asolid --bsolid --dsolid --solid -0 ; do
"${TARLZ}" $i -0 -cf out.tar.lz foo ||
test_failed $LINENO "$i $j"
"${TARLZ}" $j -0 -rf out.tar.lz bar baz ||

1
testsuite/rbar Normal file
View file

@ -0,0 +1 @@
bar

1
testsuite/rbaz Normal file
View file

@ -0,0 +1 @@
baz

1
testsuite/rfoo Normal file
View file

@ -0,0 +1 @@
foo

Binary file not shown.

Binary file not shown.