Merging upstream version 0.10.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-17 21:11:12 +01:00 · 2025-02-17 21:11:12 +01:00 · 060c1457b6
commit 060c1457b6
parent e076fdd679
21 changed files with 633 additions and 443 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+2019-01-31  Antonio Diaz Diaz  <antonio@gnu.org>
+
+	* Version 0.10 released.
+	* Added new option '--bsolid'.
+	* Added new option '-B, --data-size'.
+	* create.cc: Set ustar name to zero if extended header is used.
+
 2019-01-22  Antonio Diaz Diaz  <antonio@gnu.org>

 	* Version 0.9 released.
--- a/Makefile.in
+++ b/Makefile.in
@ -8,7 +8,7 @@ LIBS = -llz -lpthread
 SHELL = /bin/sh
 CAN_RUN_INSTALLINFO = $(SHELL) -c "install-info --version" > /dev/null 2>&1

-objs = arg_parser.o lzip_index.o create.o extract.o list_lz.o main.o
+objs = arg_parser.o lzip_index.o create.o extended.o extract.o list_lz.o main.o


 .PHONY : all install install-bin install-info install-man \
@ -30,10 +30,11 @@ main.o : main.cc

 $(objs)      : Makefile
 arg_parser.o : arg_parser.h
-create.o     : arg_parser.h lzip.h tarlz.h
-extract.o    : arg_parser.h lzip.h lzip_index.h tarlz.h
-list_lz.o    : arg_parser.h lzip.h lzip_index.h tarlz.h
-lzip_index.o : lzip.h lzip_index.h
+create.o     : arg_parser.h tarlz.h
+extended.o   : tarlz.h
+extract.o    : arg_parser.h lzip_index.h tarlz.h
+list_lz.o    : arg_parser.h lzip_index.h tarlz.h
+lzip_index.o : lzip_index.h tarlz.h
 main.o       : arg_parser.h tarlz.h


@ -123,6 +124,9 @@ dist : doc
 	  $(DISTNAME)/testsuite/test_bad1.txt.tar \
 	  $(DISTNAME)/testsuite/test_bad[12].txt \
 	  $(DISTNAME)/testsuite/t155.tar \
+	  $(DISTNAME)/testsuite/rfoo \
+	  $(DISTNAME)/testsuite/rbar \
+	  $(DISTNAME)/testsuite/rbaz \
 	  $(DISTNAME)/testsuite/test3.tar \
 	  $(DISTNAME)/testsuite/test3_bad[1-5].tar \
 	  $(DISTNAME)/testsuite/test.txt.lz \
--- a/25
+++ b/25
@ -1,16 +1,15 @@
-Changes in version 0.9:
+Changes in version 0.10:

-Multi-threaded '-t, --list' has been implemented. See chapter 'Limitations
-of parallel tar decoding' in the manual for details.
+The new option '--bsolid', which selects per-data-block compression of the
+archive, has been added. This option improves compression efficiency for
+archives with lots of small files.

-The new option '-n, --threads', which sets the number of decompression
-threads, has been added.
+The new option '-B, --data-size', which sets the size of the input data
+blocks for '--bsolid', has been added.

-Tarlz now recognizes global pax headers, but for now ignores them.
-
-Tarlz now decodes numerical fields in headers using length-safe parsers
-instead of strtoul to prevent the parser from exceeding the end of the field
-if it does not contain a terminating character.
-
-The new chapter 'Limitations of parallel tar decoding' has been added to the
-manual.
+If an extended header is required for any reason (for example a file size
+larger than 8 GiB or a link name longer than 100 bytes), tarlz now moves the
+filename also to the extended header to prevent an ustar tool from trying to
+extract the file or link. This also makes it easier, during parallel
+extraction or listing, to detect a tar member split between two lzip members
+at the boundary between the extended header and the ustar header.
--- a/2
+++ b/2
@ -6,7 +6,7 @@
 # to copy, distribute and modify it.

 pkgname=tarlz
-pkgversion=0.9
+pkgversion=0.10
 progname=tarlz
 srctrigger=doc/${pkgname}.texi

--- a/create.cc
+++ b/create.cc
@ -38,20 +38,21 @@
 #include <lzlib.h>

 #include "arg_parser.h"
-#include "lzip.h"
 #include "tarlz.h"


-const CRC32C crc32c;
+const CRC32 crc32c( true );

 int cl_owner = -1;		// global vars needed by add_member
 int cl_group = -1;
+int cl_data_size = 0;
 Solidity solidity = no_solid;

 namespace {

 LZ_Encoder * encoder = 0;	// local vars needed by add_member
 const char * archive_namep = 0;
+unsigned long long partial_data_size = 0;	// current block size
 int outfd = -1;
 int gretval = 0;

@ -150,17 +151,18 @@ bool check_appendable( const int fd, const bool remove_eof )
  }


-class File_is_archive
+class File_is_the_archive
  {
  dev_t archive_dev;
  ino_t archive_ino;
  bool initialized;
+
 public:
-  File_is_archive() : initialized( false ) {}
-  bool init()
+  File_is_the_archive() : initialized( false ) {}
+  bool init( const int fd )
    {
    struct stat st;
-    if( fstat( outfd, &st ) != 0 ) return false;
+    if( fstat( fd, &st ) != 0 ) return false;
    if( S_ISREG( st.st_mode ) )
      { archive_dev = st.st_dev; archive_ino = st.st_ino; initialized = true; }
    return true;
@ -169,7 +171,7 @@ public:
    {
    return initialized && archive_dev == st.st_dev && archive_ino == st.st_ino;
    }
-  } file_is_archive;
+  } file_is_the_archive;


 bool archive_write( const uint8_t * const buf, const int size )
@ -223,50 +225,32 @@ void print_octal( uint8_t * const buf, int size, unsigned long long num )
  while( --size >= 0 ) { buf[size] = '0' + ( num % 8 ); num /= 8; }
  }

-unsigned decimal_digits( unsigned long long value )
-  {
-  unsigned digits = 1;
-  while( value >= 10 ) { value /= 10; ++digits; }
-  return digits;
-  }
-
-int record_size( const unsigned keyword_size, const unsigned long value_size )
-  {
-  // size = ' ' + keyword + '=' + value + '\n'
-  unsigned long long size = 1 + keyword_size + 1 + value_size + 1;
-  const unsigned d1 = decimal_digits( size );
-  size += decimal_digits( d1 + size );
-  if( size >= INT_MAX ) size = 0;		// overflows snprintf size
-  return size;
-  }
-
 bool write_extended( const Extended & extended )
  {
-  const int path_rec = extended.path.size() ?
-                       record_size( 4, extended.path.size() ) : 0;
-  const int lpath_rec = extended.linkpath.size() ?
-                        record_size( 8, extended.linkpath.size() ) : 0;
-  const int size_rec = ( extended.size > 0 ) ?
-                       record_size( 4, decimal_digits( extended.size ) ) : 0;
-  const unsigned long long edsize = path_rec + lpath_rec + size_rec + 22;
-  const unsigned long long bufsize = round_up( edsize );
+  const int path_rec = extended.recsize_path();
+  const int lpath_rec = extended.recsize_linkpath();
+  const int size_rec = extended.recsize_file_size();
+  const unsigned long long edsize = extended.edsize();
+  const unsigned long long bufsize = extended.edsize_pad();
  if( edsize >= 1ULL << 33 ) return false;	// too much extended data
  if( bufsize == 0 ) return edsize == 0;	// overflow or no extended data
  char * const buf = new char[bufsize+1];	// extended records buffer
-  unsigned long long pos = path_rec;		// goto can't cross this
+  unsigned long long pos = path_rec;		// goto can't cross these
+  const unsigned crc_size = Extended::crc_record.size();
+
  if( path_rec && snprintf( buf, path_rec + 1, "%d path=%s\n",
-                            path_rec, extended.path.c_str() ) != path_rec )
+                            path_rec, extended.path().c_str() ) != path_rec )
    goto error;
  if( lpath_rec && snprintf( buf + pos, lpath_rec + 1, "%d linkpath=%s\n",
-                     lpath_rec, extended.linkpath.c_str() ) != lpath_rec )
+                     lpath_rec, extended.linkpath().c_str() ) != lpath_rec )
    goto error;
  pos += lpath_rec;
  if( size_rec && snprintf( buf + pos, size_rec + 1, "%d size=%llu\n",
-                            size_rec, extended.size ) != size_rec )
+                            size_rec, extended.file_size() ) != size_rec )
    goto error;
  pos += size_rec;
-  if( snprintf( buf + pos, 23, "22 GNU.crc32=00000000\n" ) != 22 ) goto error;
-  pos += 22;
+  std::memcpy( buf + pos, Extended::crc_record.c_str(), crc_size );
+  pos += crc_size;
  if( pos != edsize ) goto error;
  print_hex( buf + edsize - 9, 8,
             crc32c.windowed_crc( (const uint8_t *)buf, edsize - 9, edsize ) );
@ -316,27 +300,29 @@ const char * remove_leading_dotdot( const char * const filename )
  }


-// Return true if filename fits in the ustar header.
+// Return true if it stores filename in the ustar header.
 bool store_name( const char * const filename, Extended & extended,
-                 Tar_header header )
+                 Tar_header header, const bool force_extended_name )
  {
  const char * const stored_name = remove_leading_dotdot( filename );
-  const int len = std::strlen( stored_name );
-  enum { max_len = prefix_l + 1 + name_l };	// prefix + '/' + name

-  // first try storing filename in the ustar header
-  if( len <= name_l )				// stored_name fits in name
-    { std::memcpy( header + name_o, stored_name, len ); return true; }
-  if( len <= max_len )				// find shortest prefix
-    for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i )
-      if( stored_name[i] == '/' )		// stored_name can be split
-        {
-        std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 );
-        std::memcpy( header + prefix_o, stored_name, i );
-        return true;
-        }
+  if( !force_extended_name )	// try storing filename in the ustar header
+    {
+    const int len = std::strlen( stored_name );
+    enum { max_len = prefix_l + 1 + name_l };	// prefix + '/' + name
+    if( len <= name_l )				// stored_name fits in name
+      { std::memcpy( header + name_o, stored_name, len ); return true; }
+    if( len <= max_len )			// find shortest prefix
+      for( int i = len - name_l - 1; i < len && i <= prefix_l; ++i )
+        if( stored_name[i] == '/' )		// stored_name can be split
+          {
+          std::memcpy( header + name_o, stored_name + i + 1, len - i - 1 );
+          std::memcpy( header + prefix_o, stored_name, i );
+          return true;
+          }
+    }
  // store filename in extended record, leave name zeroed in ustar header
-  extended.path = stored_name;
+  extended.path( stored_name );
  return false;
  }

@ -348,13 +334,13 @@ int add_member( const char * const filename, const struct stat *,
  if( lstat( filename, &st ) != 0 )
    { show_file_error( filename, "Can't stat input file", errno );
      gretval = 1; return 0; }
-  if( file_is_archive( st ) )
+  if( file_is_the_archive( st ) )
    { show_file_error( archive_namep, "File is the archive; not dumped." );
      return 0; }
  Extended extended;		// metadata for extended records
  Tar_header header;
  init_tar_header( header );
-  store_name( filename, extended, header );
+  bool force_extended_name = false;

  const mode_t mode = st.st_mode;
  print_octal( header + mode_o, mode_l - 1,
@ -392,7 +378,8 @@ int add_member( const char * const filename, const struct stat *,
      {
      char * const buf = new char[st.st_size+1];
      len = readlink( filename, buf, st.st_size );
-      if( len == st.st_size ) { buf[len] = 0; extended.linkpath = buf; }
+      if( len == st.st_size )
+        { buf[len] = 0; extended.linkpath( buf ); force_extended_name = true; }
      delete[] buf;
      }
    if( len != st.st_size )
@ -418,12 +405,30 @@ int add_member( const char * const filename, const struct stat *,
  const struct group * const gr = getgrgid( gid );
  if( gr && gr->gr_name )
    std::strncpy( (char *)header + gname_o, gr->gr_name, gname_l - 1 );
-  if( file_size >= 1ULL << 33 ) extended.size = file_size;
+  if( file_size >= 1ULL << 33 )
+    { extended.file_size( file_size ); force_extended_name = true; }
  else print_octal( header + size_o, size_l - 1, file_size );
+  store_name( filename, extended, header, force_extended_name );
  print_octal( header + chksum_o, chksum_l - 1, ustar_chksum( header ) );

  const int infd = file_size ? open_instream( filename ) : -1;
  if( file_size && infd < 0 ) { gretval = 1; return 0; }
+  if( encoder && solidity == bsolid )
+    {
+    const unsigned long long member_size =
+      header_size + extended.full_size() + round_up( file_size );
+    const unsigned long long target_size = cl_data_size;
+    if( partial_data_size >= target_size ||
+        ( partial_data_size >= min_data_size &&
+          partial_data_size + member_size / 2 > target_size ) )
+      {
+      partial_data_size = member_size;
+      if( !archive_write( 0, 0 ) )
+        { show_error( "Error flushing encoder", errno ); return 1; }
+      }
+    else partial_data_size += member_size;
+    }
+
  if( !extended.empty() && !write_extended( extended ) )
    { show_error( "Error writing extended header", errno ); return 1; }
  if( !archive_write( header, header_size ) )
@ -491,7 +496,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser,
    { show_error( "'--concatenate' is incompatible with '-f -'.", 0, true );
      return 1; }
  if( ( outfd = open_outstream( archive_name, false ) ) < 0 ) return 1;
-  if( !file_is_archive.init() )
+  if( !file_is_the_archive.init( outfd ) )
    { show_file_error( archive_name.c_str(), "Can't stat", errno ); return 1; }

  int retval = 0;
@ -507,7 +512,7 @@ int concatenate( const std::string & archive_name, const Arg_parser & parser,
      { show_file_error( filename, "Not an appendable tar.lz archive." );
        close( infd ); retval = 2; break; }
    struct stat st;
-    if( fstat( infd, &st ) == 0 && file_is_archive( st ) )
+    if( fstat( infd, &st ) == 0 && file_is_the_archive( st ) )
      { show_file_error( filename, "File is the archive; not concatenated." );
        close( infd ); continue; }
    if( !check_appendable( outfd, true ) )
@ -572,12 +577,18 @@ int encode( const std::string & archive_name, const Arg_parser & parser,
    }

  archive_namep = archive_name.size() ? archive_name.c_str() : "(stdout)";
-  if( !file_is_archive.init() )
+  if( !file_is_the_archive.init( outfd ) )
    { show_file_error( archive_namep, "Can't stat", errno ); return 1; }

  if( compressed )
    {
-    encoder = LZ_compress_open( option_mapping[level].dictionary_size,
+    const int dictionary_size = option_mapping[level].dictionary_size;
+    if( cl_data_size <= 0 )
+      {
+      if( level == 0 ) cl_data_size = 1 << 20;
+      else cl_data_size = 2 * dictionary_size;
+      }
+    encoder = LZ_compress_open( dictionary_size,
                option_mapping[level].match_len_limit, LLONG_MAX );
    if( !encoder || LZ_compress_errno( encoder ) != LZ_ok )
      {
@ -619,7 +630,8 @@ int encode( const std::string & archive_name, const Arg_parser & parser,
    enum { bufsize = 2 * header_size };
    uint8_t buf[bufsize];
    std::memset( buf, 0, bufsize );
-    if( encoder && solidity == asolid && !archive_write( 0, 0 ) )
+    if( encoder && ( solidity == asolid || solidity == bsolid ) &&
+        !archive_write( 0, 0 ) )
      { show_error( "Error flushing encoder", errno ); retval = 1; }
    else if( !archive_write( buf, bufsize ) ||
             ( encoder && !archive_write( 0, 0 ) ) )	// flush encoder
--- a/doc/tarlz.1
+++ b/doc/tarlz.1
@ -1,5 +1,5 @@
 .\" DO NOT MODIFY THIS FILE!  It was generated by help2man 1.46.1.
-.TH TARLZ "1" "January 2019" "tarlz 0.9" "User Commands"
+.TH TARLZ "1" "January 2019" "tarlz 0.10" "User Commands"
 .SH NAME
 tarlz \- creates tar archives with multimember lzip compression
 .SH SYNOPSIS
@ -33,6 +33,9 @@ output version information and exit
 \fB\-A\fR, \fB\-\-concatenate\fR
 append tar.lz archives to the end of an archive
 .TP
+\fB\-B\fR, \fB\-\-data\-size=\fR<bytes>
+set target size of input data blocks [2x8=16 MiB]
+.TP
 \fB\-c\fR, \fB\-\-create\fR
 create a new archive
 .TP
@ -66,6 +69,9 @@ set compression level [default 6]
 \fB\-\-asolid\fR
 create solidly compressed appendable archive
 .TP
+\fB\-\-bsolid\fR
+create per\-data\-block compressed archive
+.TP
 \fB\-\-dsolid\fR
 create per\-directory compressed archive
 .TP
--- a/doc/tarlz.info
+++ b/doc/tarlz.info
@ -11,7 +11,7 @@ File: tarlz.info,  Node: Top,  Next: Introduction,  Up: (dir)
 Tarlz Manual
 ************

-This manual is for Tarlz (version 0.9, 22 January 2019).
+This manual is for Tarlz (version 0.10, 31 January 2019).

 * Menu:

@ -120,6 +120,13 @@ archive 'foo'.
     the archive if no FILES have been specified. Tarlz can't
     concatenate uncompressed tar archives.

+'-B BYTES'
+'--data-size=BYTES'
+     Set target size of input data blocks for the '--bsolid' option.
+     Valid values range from 8 KiB to 1 GiB. Default value is two times
+     the dictionary size, except for option '-0' where it defaults to
+     1 MiB.
+
 '-c'
 '--create'
     Create a new archive from FILES.
@ -190,6 +197,18 @@ archive 'foo'.
     members it creates, reducing the amount of memory required for
     decompression.

+     Level   Dictionary size   Match length limit
+     -0      64 KiB            16 bytes
+     -1      1 MiB             5 bytes
+     -2      1.5 MiB           6 bytes
+     -3      2 MiB             8 bytes
+     -4      3 MiB             12 bytes
+     -5      4 MiB             20 bytes
+     -6      8 MiB             36 bytes
+     -7      16 MiB            68 bytes
+     -8      24 MiB            132 bytes
+     -9      32 MiB            273 bytes
+
 '--asolid'
     When creating or appending to a compressed archive, use appendable
     solid compression. All the files being added to the archive are
@ -197,6 +216,15 @@ archive 'foo'.
     are compressed into a separate lzip member. This creates a solidly
     compressed appendable archive.

+'--bsolid'
+     When creating or appending to a compressed archive, compress tar
+     members together in a lzip member until they approximate a target
+     uncompressed size.  The size can't be exact because each solidly
+     compressed data block must contain an integer number of tar
+     members. This option improves compression efficiency for archives
+     with lots of small files. *Note --data-size::, to set the target
+     block size.
+
 '--dsolid'
     When creating or appending to a compressed archive, use solid
     compression for each directory especified in the command line. The
@ -560,13 +588,13 @@ old tar programs from extracting the extended records as a file in the
 wrong place.  Tarlz also sets to zero those fields of the ustar header
 overridden by extended records.

-   If the extended header is needed because of a file size larger than
-8 GiB, the size field will be unable to contain the full size of the
-file. Therefore the file may be partially extracted, and the tool will
-issue a spurious warning about a corrupt header at the point where it
-thinks the file ends. Setting to zero the overridden size in the ustar
-header at least prevents the partial extraction and makes obvious that
-the file has been truncated.
+   If an extended header is required for any reason (for example a file
+size larger than 8 GiB or a link name longer than 100 bytes), tarlz
+moves the filename also to the extended header to prevent an ustar tool
+from trying to extract the file or link. This also makes easier during
+parallel extraction or listing the detection of a tar member split
+between two lzip members at the boundary between the extended header
+and the ustar header.


 4.3 As simple as possible (but not simpler)
@ -626,10 +654,10 @@ to single-threaded mode and continues decoding the archive. Currently
 only the '--list' option is able to do multi-threaded decoding.

   If the files in the archive are large, multi-threaded '--list' on a
-regular tar.lz archive can be hundreds of times faster than sequential
-'--list' because, in addition to using several processors, it only
-needs to decompress part of each lzip member. See the following example
-listing the Silesia corpus on a dual core machine:
+regular (seekable) tar.lz archive can be hundreds of times faster than
+sequential '--list' because, in addition to using several processors,
+it only needs to decompress part of each lzip member. See the following
+example listing the Silesia corpus on a dual core machine:

     tarlz -9 -cf silesia.tar.lz silesia
     time lzip -cd silesia.tar.lz | tar -tf -            (5.032s)
@ -690,9 +718,9 @@ Example 7: Extract files 'a' and 'c' from archive 'archive.tar.lz'.


 Example 8: Copy the contents of directory 'sourcedir' to the directory
-'targetdir'.
+'destdir'.

-     tarlz -C sourcedir -c . | tarlz -C targetdir -x
+     tarlz -C sourcedir -c . | tarlz -C destdir -x


 File: tarlz.info,  Node: Problems,  Next: Concept index,  Prev: Examples,  Up: Top
@ -734,17 +762,18 @@ Concept index

 Tag Table:
 Node: Top223
-Node: Introduction1012
-Node: Invoking tarlz3124
-Node: File format10384
-Ref: key_crc3215169
-Node: Amendments to pax format20586
-Ref: crc3221110
-Ref: flawed-compat22135
-Node: Multi-threaded tar24508
-Node: Examples27012
-Node: Problems28682
-Node: Concept index29208
+Node: Introduction1013
+Node: Invoking tarlz3125
+Ref: --data-size4717
+Node: File format11536
+Ref: key_crc3216321
+Node: Amendments to pax format21738
+Ref: crc3222262
+Ref: flawed-compat23287
+Node: Multi-threaded tar25649
+Node: Examples28164
+Node: Problems29830
+Node: Concept index30356

 End Tag Table

--- a/doc/tarlz.texi
+++ b/doc/tarlz.texi
@ -6,8 +6,8 @@
@finalout
@c %**end of header

-@set UPDATED 22 January 2019
-@set VERSION 0.9
+@set UPDATED 31 January 2019
+@set VERSION 0.10

@dircategory Data Compression
@direntry
@ -89,7 +89,7 @@ member) just like to an uncompressed tar archive.
 It is a safe posix-style backup format. In case of corruption,
 tarlz can extract all the undamaged members from the tar.lz
 archive, skipping over the damaged members, just like the standard
-(uncompressed) tar. Moreover, the option @code{--keep-damaged} can be
+(uncompressed) tar. Moreover, the option @samp{--keep-damaged} can be
 used to recover as much data as possible from each damaged member,
 and lziprecover can be used to recover some of the damaged members.

@ -154,6 +154,13 @@ end-of-file blocks are removed as each new archive is concatenated. Exit
 with status 0 without modifying the archive if no @var{files} have been
 specified. Tarlz can't concatenate uncompressed tar archives.

+@anchor{--data-size}
+@item -B @var{bytes}
+@itemx --data-size=@var{bytes}
+Set target size of input data blocks for the @samp{--bsolid} option. Valid
+values range from @w{8 KiB} to @w{1 GiB}. Default value is two times the
+dictionary size, except for option @samp{-0} where it defaults to @w{1 MiB}.
+
@item -c
@itemx --create
 Create a new archive from @var{files}.
@ -161,13 +168,13 @@ Create a new archive from @var{files}.
@item -C @var{dir}
@itemx --directory=@var{dir}
 Change to directory @var{dir}. When creating or appending, the position
-of each @code{-C} option in the command line is significant; it will
+of each @samp{-C} option in the command line is significant; it will
 change the current working directory for the following @var{files} until
-a new @code{-C} option appears in the command line. When extracting, all
-the @code{-C} options are executed in sequence before starting the
-extraction. Listing ignores any @code{-C} options specified. @var{dir}
+a new @samp{-C} option appears in the command line. When extracting, all
+the @samp{-C} options are executed in sequence before starting the
+extraction. Listing ignores any @samp{-C} options specified. @var{dir}
 is relative to the then current working directory, perhaps changed by a
-previous @code{-C} option.
+previous @samp{-C} option.

@item -f @var{archive}
@itemx --file=@var{archive}
@ -222,6 +229,20 @@ Set the compression level. The default compression level is @samp{-6}.
 Like lzip, tarlz also minimizes the dictionary size of the lzip members
 it creates, reducing the amount of memory required for decompression.

+@multitable {Level} {Dictionary size} {Match length limit}
+@item Level @tab Dictionary size @tab Match length limit
+@item -0 @tab 64 KiB @tab  16 bytes
+@item -1 @tab  1 MiB @tab   5 bytes
+@item -2 @tab  1.5 MiB @tab   6 bytes
+@item -3 @tab  2 MiB @tab   8 bytes
+@item -4 @tab  3 MiB @tab  12 bytes
+@item -5 @tab  4 MiB @tab  20 bytes
+@item -6 @tab  8 MiB @tab  36 bytes
+@item -7 @tab 16 MiB @tab  68 bytes
+@item -8 @tab 24 MiB @tab 132 bytes
+@item -9 @tab 32 MiB @tab 273 bytes
+@end multitable
+
@item --asolid
 When creating or appending to a compressed archive, use appendable solid
 compression. All the files being added to the archive are compressed
@ -229,6 +250,14 @@ into a single lzip member, but the end-of-file blocks are compressed
 into a separate lzip member. This creates a solidly compressed
 appendable archive.

+@item --bsolid
+When creating or appending to a compressed archive, compress tar members
+together in a lzip member until they approximate a target uncompressed size.
+The size can't be exact because each solidly compressed data block must
+contain an integer number of tar members. This option improves compression
+efficiency for archives with lots of small files. @xref{--data-size}, to set
+the target block size.
+
@item --dsolid
 When creating or appending to a compressed archive, use solid
 compression for each directory especified in the command line. The
@ -252,7 +281,7 @@ resulting archive is not appendable. No more files can be later appended
 to the archive.

@item --anonymous
-Equivalent to @code{--owner=root --group=root}.
+Equivalent to @samp{--owner=root --group=root}.

@item --owner=@var{owner}
 When creating or appending, use @var{owner} for files added to the
@ -287,7 +316,7 @@ keyword appearing in the same block of extended records.
@end ignore

@item --uncompressed
-With @code{--create}, don't compress the created tar archive. Create an
+With @samp{--create}, don't compress the created tar archive. Create an
 uncompressed tar archive instead.

@end table
@ -350,7 +379,7 @@ Zero or more blocks that contain the contents of the file.
@end itemize

 Each tar member must be contiguously stored in a lzip member for the
-parallel decoding operations like @code{--list} to work. If any tar member
+parallel decoding operations like @samp{--list} to work. If any tar member
 is split over two or more lzip members, the archive must be decoded
 sequentially. @xref{Multi-threaded tar}.

@ -381,7 +410,7 @@ tar.lz
@end verbatim

@ignore
-When @code{--permissive} is used, the following violations of the
+When @samp{--permissive} is used, the following violations of the
 archive format are allowed:@*
 If several extended headers precede an ustar header, only the last
 extended header takes effect. The other extended headers are ignored.
@ -623,13 +652,12 @@ programs from extracting the extended records as a file in the wrong place.
 Tarlz also sets to zero those fields of the ustar header overridden by
 extended records.

-If the extended header is needed because of a file size larger than
-@w{8 GiB}, the size field will be unable to contain the full size of the
-file. Therefore the file may be partially extracted, and the tool will issue
-a spurious warning about a corrupt header at the point where it thinks the
-file ends. Setting to zero the overridden size in the ustar header at least
-prevents the partial extraction and makes obvious that the file has been
-truncated.
+If an extended header is required for any reason (for example a file size
+larger than @w{8 GiB} or a link name longer than 100 bytes), tarlz moves the
+filename also to the extended header to prevent an ustar tool from trying to
+extract the file or link. This also makes it easier, during parallel
+extraction or listing, to detect a tar member split between two lzip members
+at the boundary between the extended header and the ustar header.

@sp 1
@section As simple as possible (but not simpler)
@ -679,14 +707,14 @@ decoding it safely in parallel.
 Tarlz is able to automatically decode aligned and unaligned multimember
 tar.lz archives, keeping backwards compatibility. If tarlz finds a member
 misalignment during multi-threaded decoding, it switches to single-threaded
-mode and continues decoding the archive. Currently only the @code{--list}
+mode and continues decoding the archive. Currently only the @samp{--list}
 option is able to do multi-threaded decoding.

-If the files in the archive are large, multi-threaded @code{--list} on a
-regular tar.lz archive can be hundreds of times faster than sequential
-@code{--list} because, in addition to using several processors, it only
-needs to decompress part of each lzip member. See the following example
-listing the Silesia corpus on a dual core machine:
+If the files in the archive are large, multi-threaded @samp{--list} on a
+regular (seekable) tar.lz archive can be hundreds of times faster than
+sequential @samp{--list} because, in addition to using several processors,
+it only needs to decompress part of each lzip member. See the following
+example listing the Silesia corpus on a dual core machine:

@example
 tarlz -9 -cf silesia.tar.lz silesia
@ -772,10 +800,10 @@ tarlz -xf archive.tar.lz a c
@sp 1
@noindent
 Example 8: Copy the contents of directory @samp{sourcedir} to the
-directory @samp{targetdir}.
+directory @samp{destdir}.

@example
-tarlz -C sourcedir -c . | tarlz -C targetdir -x
+tarlz -C sourcedir -c . | tarlz -C destdir -x
@end example


--- a/extended.cc
+++ b/extended.cc
@ -0,0 +1,156 @@
+/*  Tarlz - Archiver with multimember lzip compression
+    Copyright (C) 2013-2019 Antonio Diaz Diaz.
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <cctype>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <stdint.h>
+
+#include "tarlz.h"
+
+
+namespace {
+
+// Return the number of decimal digits needed to print 'value' (at least 1).
+unsigned decimal_digits( unsigned long long value )
+  {
+  unsigned count = 1;
+  for( ; value >= 10; value /= 10 ) ++count;
+  return count;
+  }
+
+
+// Return the full length of an extended record, including its own decimal
+// length prefix, for a keyword of 'keyword_size' bytes and a value of
+// 'value_size' bytes. Returns 0 if the length does not fit in an int.
+int record_size( const unsigned keyword_size, const unsigned long value_size )
+  {
+  // everything but the length prefix: ' ' + keyword + '=' + value + '\n'
+  const unsigned long long payload = 1 + keyword_size + 1 + value_size + 1;
+  const unsigned prefix_guess = decimal_digits( payload );
+  // adding the prefix may itself add one more digit to the prefix
+  const unsigned long long total =
+    payload + decimal_digits( prefix_guess + payload );
+  return ( total >= INT_MAX ) ? 0 : total;	// overflows snprintf size
+  }
+
+
+// Parse an unsigned decimal number from at most 'size' bytes at 'ptr',
+// skipping any leading whitespace.
+// On success returns the value and, if 'tailp' is not null, sets *tailp to
+// the first byte after the last digit.
+// On failure (no digit found within 'size' bytes, or value larger than
+// LLONG_MAX) returns 0 and sets *tailp to 'ptr'.
+unsigned long long parse_decimal( const char * const ptr,
+                                  const char ** const tailp,
+                                  const unsigned long long size )
+  {
+  unsigned long long result = 0;
+  unsigned long long i = 0;
+  // <cctype> functions require a value representable as unsigned char
+  while( i < size && std::isspace( (unsigned char)ptr[i] ) ) ++i;
+  // never look at ptr[size]; the field may not be terminated
+  if( i >= size || !std::isdigit( (unsigned char)ptr[i] ) )
+    { if( tailp ) *tailp = ptr; return 0; }
+  for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i )
+    {
+    const unsigned digit = ptr[i] - '0';
+    // test before multiplying; a wrapped product can look like a valid value
+    if( result > ( LLONG_MAX - digit ) / 10 )		// overflow
+      { if( tailp ) *tailp = ptr; return 0; }
+    result = result * 10 + digit;
+    }
+  if( tailp ) *tailp = ptr + i;
+  return result;
+  }
+
+
+// Decode the 8 hexadecimal digits (either case) at 'ptr' into a 32-bit
+// value. Returns 0 as soon as a non-hexadecimal character is found.
+uint32_t parse_record_crc( const char * const ptr )
+  {
+  uint32_t value = 0;
+  for( int k = 0; k < 8; ++k )
+    {
+    const char ch = ptr[k];
+    unsigned nibble;
+    if( ch >= '0' && ch <= '9' ) nibble = ch - '0';
+    else if( ch >= 'A' && ch <= 'F' ) nibble = ch - 'A' + 10;
+    else if( ch >= 'a' && ch <= 'f' ) nibble = ch - 'a' + 10;
+    else return 0;			// invalid digit in crc string
+    value = ( value << 4 ) + nibble;
+    }
+  return value;
+  }
+
+} // end namespace
+
+
+// Text of the 22-byte 'GNU.crc32' extended record with its 8-digit value
+// field zeroed.
+const std::string Extended::crc_record( "22 GNU.crc32=00000000\n" );
+
+// Return the size of the 'linkpath' extended record, or 0 if the link path
+// is empty. The result is computed on first use and cached; a negative
+// cached value means "not yet computed".
+int Extended::recsize_linkpath() const
+  {
+  if( recsize_linkpath_ >= 0 ) return recsize_linkpath_;
+  if( linkpath_.size() )
+    recsize_linkpath_ = record_size( 8, linkpath_.size() );
+  else
+    recsize_linkpath_ = 0;
+  return recsize_linkpath_;
+  }
+
+// Return the size of the 'path' extended record, or 0 if the path is empty.
+// The result is computed on first use and cached; a negative cached value
+// means "not yet computed".
+int Extended::recsize_path() const
+  {
+  if( recsize_path_ >= 0 ) return recsize_path_;
+  if( path_.size() )
+    recsize_path_ = record_size( 4, path_.size() );
+  else
+    recsize_path_ = 0;
+  return recsize_path_;
+  }
+
+// Return the size of the 'size' extended record, or 0 if no file size is
+// stored. The result is computed on first use and cached; a negative cached
+// value means "not yet computed".
+int Extended::recsize_file_size() const
+  {
+  // the record value is the file size printed in decimal, so its length is
+  // the number of digits, not the file size itself
+  if( recsize_file_size_ < 0 ) recsize_file_size_ = ( file_size_ > 0 ) ?
+    record_size( 4, decimal_digits( file_size_ ) ) : 0;
+  return recsize_file_size_;
+  }
+
+
+// Parse the 'edsize' bytes of extended records at 'buf' into this object,
+// after resetting it.
+// Each record has the form "<length> <keyword>=<value>\n". The keywords
+// 'path', 'linkpath', 'size' and 'GNU.crc32' are decoded; records with any
+// other keyword are skipped.
+// Returns false if a record is malformed, if a known keyword is repeated
+// and 'permissive' is false, if the 'size' value can't be parsed or is
+// smaller than 8 GiB, or if the stored CRC does not match the computed one.
+// On success sets full_size_ to the header size plus the padded data size.
+bool Extended::parse( const char * const buf, const unsigned long long edsize,
+                      const bool permissive )
+  {
+  reset();
+  for( unsigned long long pos = 0; pos < edsize; )	// parse records
+    {
+    const char * tail;
+    const unsigned long long rsize =
+      parse_decimal( buf + pos, &tail, edsize - pos );
+    if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' ||
+        buf[pos+rsize-1] != '\n' ) return false;
+    ++tail;	// point to keyword
+    // parse_decimal skips leading whitespace, which may include the '\n'
+    // accepted above as end of record; reject records ending before the
+    // keyword so that 'rest' can't underflow
+    if( tail > buf + ( pos + rsize - 1 ) ) return false;
+    // rest = length of (keyword + '=' + value) without the final newline
+    const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail;
+    if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 )
+      { if( path_.size() && !permissive ) return false;
+        path_.assign( tail + 5, rest - 5 ); }
+    else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
+      { if( linkpath_.size() && !permissive ) return false;
+        linkpath_.assign( tail + 9, rest - 9 ); }
+    else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
+      {
+      if( file_size_ != 0 && !permissive ) return false;
+      file_size_ = parse_decimal( tail + 5, &tail, rest - 5 );
+      // parse error or size fits in ustar header
+      if( file_size_ < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) )
+        return false;
+      }
+    else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
+      {
+      if( crc_present_ && !permissive ) return false;
+      if( rsize != crc_record.size() ) return false;
+      const uint32_t stored_crc = parse_record_crc( tail + 10 );
+      // the computation stops just before the 8 stored digits
+      const uint32_t computed_crc =
+        crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
+      crc_present_ = true;
+      if( stored_crc != computed_crc ) return false;
+      }
+    pos += rsize;
+    }
+  full_size_ = header_size + round_up( edsize );
+  return true;
+  }
--- a/extract.cc
+++ b/extract.cc
@ -37,7 +37,6 @@
 #include <lzlib.h>

 #include "arg_parser.h"
-#include "lzip.h"
 #include "lzip_index.h"
 #include "tarlz.h"

@ -268,19 +267,19 @@ void format_member_name( const Extended & extended, const Tar_header header,
    for( int i = 0; i < 2; ++i )
      {
      const int len = snprintf( rbuf() + offset, rbuf.size() - offset,
-                  " %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
-                  extended.size, 1900 + tm->tm_year, 1 + tm->tm_mon,
-                  tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path.c_str(),
-                  link_string, !islink ? "" : extended.linkpath.c_str() );
+                " %9llu %4d-%02u-%02u %02u:%02u %s%s%s\n",
+                extended.file_size(), 1900 + tm->tm_year, 1 + tm->tm_mon,
+                tm->tm_mday, tm->tm_hour, tm->tm_min, extended.path().c_str(),
+                link_string, !islink ? "" : extended.linkpath().c_str() );
      if( (int)rbuf.size() > len + offset ) break;
      else rbuf.resize( len + offset + 1 );
      }
    }
  else
    {
-    if( rbuf.size() < extended.path.size() + 2 )
-      rbuf.resize( extended.path.size() + 2 );
-    snprintf( rbuf(), rbuf.size(), "%s\n", extended.path.c_str() );
+    if( rbuf.size() < extended.path().size() + 2 )
+      rbuf.resize( extended.path().size() + 2 );
+    snprintf( rbuf(), rbuf.size(), "%s\n", extended.path().c_str() );
    }
  }

@ -303,8 +302,8 @@ int list_member( const int infd, const Extended & extended,

  const unsigned bufsize = 32 * header_size;
  uint8_t buf[bufsize];
-  unsigned long long rest = extended.size;
-  const int rem = extended.size % header_size;
+  unsigned long long rest = extended.file_size();
+  const int rem = rest % header_size;
  const int padding = rem ? header_size - rem : 0;
  while( rest > 0 )
    {
@ -331,7 +330,7 @@ bool contains_dotdot( const char * const filename )
 int extract_member( const int infd, const Extended & extended,
                    const Tar_header header, const bool keep_damaged )
  {
-  const char * const filename = extended.path.c_str();
+  const char * const filename = extended.path().c_str();
  if( contains_dotdot( filename ) )
    {
    show_file_error( filename, "Contains a '..' component, skipping." );
@ -357,7 +356,7 @@ int extract_member( const int infd, const Extended & extended,
    case tf_link:
    case tf_symlink:
      {
-      const char * const linkname = extended.linkpath.c_str();
+      const char * const linkname = extended.linkpath().c_str();
 /*      if( contains_dotdot( linkname ) )
        {
        show_file_error( filename,
@ -421,8 +420,8 @@ int extract_member( const int infd, const Extended & extended,

  const unsigned bufsize = 32 * header_size;
  uint8_t buf[bufsize];
-  unsigned long long rest = extended.size;
-  const int rem = extended.size % header_size;
+  unsigned long long rest = extended.file_size();
+  const int rem = rest % header_size;
  const int padding = rem ? header_size - rem : 0;
  while( rest > 0 )
    {
@ -501,42 +500,6 @@ bool compare_tslash( const char * const name1, const char * const name2 )

 namespace {

-unsigned long long parse_decimal( const char * const ptr,
-                                  const char ** const tailp,
-                                  const unsigned long long size )
-  {
-  unsigned long long result = 0;
-  unsigned long long i = 0;
-  while( i < size && std::isspace( ptr[i] ) ) ++i;
-  if( !std::isdigit( (unsigned char)ptr[i] ) )
-    { if( tailp ) *tailp = ptr; return 0; }
-  for( ; i < size && std::isdigit( (unsigned char)ptr[i] ); ++i )
-    {
-    const unsigned long long prev = result;
-    result *= 10; result += ptr[i] - '0';
-    if( result < prev || result > LLONG_MAX )		// overflow
-      { if( tailp ) *tailp = ptr; return 0; }
-    }
-  if( tailp ) *tailp = ptr + i;
-  return result;
-  }
-
-
-uint32_t parse_record_crc( const char * const ptr )
-  {
-  uint32_t crc = 0;
-  for( int i = 0; i < 8; ++i )
-    {
-    crc <<= 4;
-    if( ptr[i] >= '0' && ptr[i] <= '9' ) crc += ptr[i] - '0';
-    else if( ptr[i] >= 'A' && ptr[i] <= 'F' ) crc += ptr[i] + 10 - 'A';
-    else if( ptr[i] >= 'a' && ptr[i] <= 'f' ) crc += ptr[i] + 10 - 'a';
-    else { crc = 0; break; }		// invalid digit in crc string
-    }
-  return crc;
-  }
-
-
 bool parse_records( const int infd, Extended & extended,
                    const Tar_header header, const bool permissive )
  {
@ -602,48 +565,6 @@ unsigned long long parse_octal( const uint8_t * const ptr, const int size )
  }


-bool Extended::parse( const char * const buf, const unsigned long long edsize,
-                      const bool permissive )
-  {
-  for( unsigned long long pos = 0; pos < edsize; )	// parse records
-    {
-    const char * tail;
-    const unsigned long long rsize =
-      parse_decimal( buf + pos, &tail, edsize - pos );
-    if( rsize == 0 || rsize > edsize - pos || tail[0] != ' ' ||
-        buf[pos+rsize-1] != '\n' ) return false;
-    ++tail;	// point to keyword
-    // rest = length of (keyword + '=' + value) without the final newline
-    const unsigned long long rest = ( buf + ( pos + rsize - 1 ) ) - tail;
-    if( rest > 5 && std::memcmp( tail, "path=", 5 ) == 0 )
-      { if( path.size() && !permissive ) return false;
-        path.assign( tail + 5, rest - 5 ); }
-    else if( rest > 9 && std::memcmp( tail, "linkpath=", 9 ) == 0 )
-      { if( linkpath.size() && !permissive ) return false;
-        linkpath.assign( tail + 9, rest - 9 ); }
-    else if( rest > 5 && std::memcmp( tail, "size=", 5 ) == 0 )
-      {
-      if( size != 0 && !permissive ) return false;
-      size = parse_decimal( tail + 5, &tail, rest - 5 );
-      // parse error or size fits in ustar header
-      if( size < 1ULL << 33 || tail != buf + ( pos + rsize - 1 ) ) return false;
-      }
-    else if( rest > 10 && std::memcmp( tail, "GNU.crc32=", 10 ) == 0 )
-      {
-      if( crc_present && !permissive ) return false;
-      if( rsize != 22 ) return false;
-      const uint32_t stored_crc = parse_record_crc( tail + 10 );
-      const uint32_t computed_crc =
-        crc32c.windowed_crc( (const uint8_t *)buf, pos + rsize - 9, edsize );
-      crc_present = true;
-      if( stored_crc != computed_crc ) return false;
-      }
-    pos += rsize;
-    }
-  return true;
-  }
-
-
 int decode( const std::string & archive_name, const Arg_parser & parser,
            const int filenames, const int num_workers, const int debug_level,
            const bool keep_damaged, const bool listing, const bool missing_crc,
@ -722,23 +643,27 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
      if( !parse_records( infd, extended, header, permissive ) )
        { show_error( "Error in extended records. Skipping to next header." );
          extended.reset(); gretval = 2; }
-      else if( !extended.crc_present && missing_crc )
+      else if( !extended.crc_present() && missing_crc )
        { show_error( "Missing CRC in extended records.", 0, true ); return 2; }
      prev_extended = true;
      continue;
      }
    prev_extended = false;

-    if( extended.linkpath.empty() )	// copy linkpath from ustar header
+    if( extended.linkpath().empty() )	// copy linkpath from ustar header
      {
-      for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
-        extended.linkpath += header[linkname_o+i];
-      while( extended.linkpath.size() > 1 &&		// trailing '/'
-             extended.linkpath[extended.linkpath.size()-1] == '/' )
-        extended.linkpath.resize( extended.linkpath.size() - 1 );
+      int len = 0;
+      while( len < linkname_l && header[linkname_o+len] ) ++len;
+      while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/'
+      if( len > 0 )
+        {
+        const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0;
+        extended.linkpath( (const char *)header + linkname_o );
+        header[linkname_o+len] = c;
+        }
      }

-    if( extended.path.empty() )		// copy path from ustar header
+    if( extended.path().empty() )		// copy path from ustar header
      {
      char stored_name[prefix_l+1+name_l+1];
      int len = 0;
@ -749,9 +674,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
        { stored_name[len] = header[name_o+i]; ++len; }
      while( len > 0 && stored_name[len-1] == '/' ) --len;	// trailing '/'
      stored_name[len] = 0;
-      extended.path = remove_leading_slash( stored_name );
+      extended.path( remove_leading_slash( stored_name ) );
      }
-    const char * const filename = extended.path.c_str();
+    const char * const filename = extended.path().c_str();

    bool skip = filenames > 0;
    if( skip )
@ -765,9 +690,9 @@ int decode( const std::string & archive_name, const Arg_parser & parser,
            { skip = false; name_pending[i] = false; break; }
          }

-    if( extended.size == 0 &&
+    if( extended.file_size() == 0 &&
        ( typeflag == tf_regular || typeflag == tf_hiperf ) )
-      extended.size = parse_octal( header + size_o, size_l );
+      extended.file_size( parse_octal( header + size_o, size_l ) );

    if( listing || skip )
      retval = list_member( infd, extended, header, skip );
--- a/list_lz.cc
+++ b/list_lz.cc
@ -32,7 +32,6 @@
 #include <lzlib.h>

 #include "arg_parser.h"
-#include "lzip.h"
 #include "lzip_index.h"
 #include "tarlz.h"

@ -355,8 +354,8 @@ int list_member_lz( LZ_Decoder * const decoder, const int infd,
                    Resizable_buffer & rbuf, const long member_id,
                    const int worker_id, const char ** msg, const bool skip )
  {
-  unsigned long long rest = extended.size;
-  const int rem = extended.size % header_size;
+  unsigned long long rest = extended.file_size();
+  const int rem = rest % header_size;
  const int padding = rem ? header_size - rem : 0;
  const long long data_rest = mdata_end - ( data_pos + rest + padding );
  bool master = false;
@ -527,7 +526,7 @@ extern "C" void * dworker_l( void * arg )
            ret = 2; }
        else ret = parse_records_lz( decoder, infd, file_pos, member_end,
                          cdata_size, data_pos, extended, header, &msg, permissive );
-        if( ret == 0 && !extended.crc_present && missing_crc )
+        if( ret == 0 && !extended.crc_present() && missing_crc )
          { msg = "Missing CRC in extended records."; ret = 2; }
        if( ret != 0 )
          {
@ -549,16 +548,20 @@ extern "C" void * dworker_l( void * arg )
        }
      prev_extended = false;

-      if( extended.linkpath.empty() )	// copy linkpath from ustar header
+      if( extended.linkpath().empty() )	// copy linkpath from ustar header
        {
-        for( int i = 0; i < linkname_l && header[linkname_o+i]; ++i )
-          extended.linkpath += header[linkname_o+i];
-        while( extended.linkpath.size() > 1 &&		// trailing '/'
-               extended.linkpath[extended.linkpath.size()-1] == '/' )
-          extended.linkpath.resize( extended.linkpath.size() - 1 );
+        int len = 0;
+        while( len < linkname_l && header[linkname_o+len] ) ++len;
+        while( len > 1 && header[linkname_o+len-1] == '/' ) --len; // trailing '/'
+        if( len > 0 )
+          {
+          const uint8_t c = header[linkname_o+len]; header[linkname_o+len] = 0;
+          extended.linkpath( (const char *)header + linkname_o );
+          header[linkname_o+len] = c;
+          }
        }

-      if( extended.path.empty() )		// copy path from ustar header
+      if( extended.path().empty() )		// copy path from ustar header
        {
        char stored_name[prefix_l+1+name_l+1];
        int len = 0;
@ -569,9 +572,9 @@ extern "C" void * dworker_l( void * arg )
          { stored_name[len] = header[name_o+i]; ++len; }
        while( len > 0 && stored_name[len-1] == '/' ) --len;	// trailing '/'
        stored_name[len] = 0;
-        extended.path = remove_leading_slash( stored_name );
+        extended.path( remove_leading_slash( stored_name ) );
        }
-      const char * const filename = extended.path.c_str();
+      const char * const filename = extended.path().c_str();

      bool skip = filenames > 0;
      if( skip )
@ -585,9 +588,9 @@ extern "C" void * dworker_l( void * arg )
              { skip = false; name_pending[i] = false; break; }
            }

-      if( extended.size == 0 &&
+      if( extended.file_size() == 0 &&
          ( typeflag == tf_regular || typeflag == tf_hiperf ) )
-        extended.size = parse_octal( header + size_o, size_l );
+        extended.file_size( parse_octal( header + size_o, size_l ) );

      retval = list_member_lz( decoder, infd, file_pos, member_end,
                               cdata_size, data_pos, mdata_end, courier,
@ -643,7 +646,7 @@ int list_lz( const Arg_parser & parser, std::vector< char > & name_pending,
             const int debug_level, const int infd, const int num_workers,
             const bool missing_crc, const bool permissive )
  {
-  const int out_slots = 100;
+  const int out_slots = 65536;		// max small files (<=512B) in 64 MiB
  Packet_courier courier( num_workers, out_slots );

  Worker_arg * worker_args = new( std::nothrow ) Worker_arg[num_workers];
--- a/lzip.h
+++ b/lzip.h
@ -1,146 +0,0 @@
-/*  Tarlz - Archiver with multimember lzip compression
-    Copyright (C) 2013-2019 Antonio Diaz Diaz.
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef LZ_API_VERSION
-#define LZ_API_VERSION 1
-#endif
-
-enum {
-  min_dictionary_bits = 12,
-  min_dictionary_size = 1 << min_dictionary_bits,
-  max_dictionary_bits = 29,
-  max_dictionary_size = 1 << max_dictionary_bits,
-  min_member_size = 36 };
-
-
-class CRC32
-  {
-  uint32_t data[256];		// Table of CRCs of all 8-bit messages.
-
-public:
-  CRC32()
-    {
-    for( unsigned n = 0; n < 256; ++n )
-      {
-      unsigned c = n;
-      for( int k = 0; k < 8; ++k )
-        { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
-      data[n] = c;
-      }
-    }
-
-  void update_byte( uint32_t & crc, const uint8_t byte ) const
-    { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); }
-  };
-
-
-inline bool isvalid_ds( const unsigned dictionary_size )
-  { return ( dictionary_size >= min_dictionary_size &&
-             dictionary_size <= max_dictionary_size ); }
-
-
-const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 };	// "LZIP"
-
-struct Lzip_header
-  {
-  uint8_t data[6];			// 0-3 magic bytes
-					//   4 version
-					//   5 coded_dict_size
-  enum { size = 6 };
-
-  bool verify_magic() const
-    { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }
-
-  bool verify_prefix( const int sz ) const	// detect (truncated) header
-    {
-    for( int i = 0; i < sz && i < 4; ++i )
-      if( data[i] != lzip_magic[i] ) return false;
-    return ( sz > 0 );
-    }
-  bool verify_corrupt() const			// detect corrupt header
-    {
-    int matches = 0;
-    for( int i = 0; i < 4; ++i )
-      if( data[i] == lzip_magic[i] ) ++matches;
-    return ( matches > 1 && matches < 4 );
-    }
-
-  uint8_t version() const { return data[4]; }
-  bool verify_version() const { return ( data[4] == 1 ); }
-
-  unsigned dictionary_size() const
-    {
-    unsigned sz = ( 1 << ( data[5] & 0x1F ) );
-    if( sz > min_dictionary_size )
-      sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
-    return sz;
-    }
-  };
-
-
-struct Lzip_trailer
-  {
-  uint8_t data[20];	//  0-3  CRC32 of the uncompressed data
-			//  4-11 size of the uncompressed data
-			// 12-19 member size including header and trailer
-  enum { size = 20 };
-
-  unsigned data_crc() const
-    {
-    unsigned tmp = 0;
-    for( int i = 3; i >= 0; --i ) { tmp <<= 8; tmp += data[i]; }
-    return tmp;
-    }
-
-  unsigned long long data_size() const
-    {
-    unsigned long long tmp = 0;
-    for( int i = 11; i >= 4; --i ) { tmp <<= 8; tmp += data[i]; }
-    return tmp;
-    }
-
-  unsigned long long member_size() const
-    {
-    unsigned long long tmp = 0;
-    for( int i = 19; i >= 12; --i ) { tmp <<= 8; tmp += data[i]; }
-    return tmp;
-    }
-
-  bool verify_consistency() const	// check internal consistency
-    {
-    const unsigned crc = data_crc();
-    const unsigned long long dsize = data_size();
-    if( ( crc == 0 ) != ( dsize == 0 ) ) return false;
-    const unsigned long long msize = member_size();
-    if( msize < min_member_size ) return false;
-    const unsigned long long mlimit = ( 9 * dsize + 7 ) / 8 + min_member_size;
-    if( mlimit > dsize && msize > mlimit ) return false;
-    const unsigned long long dlimit = 7090 * ( msize - 26 ) - 1;
-    if( dlimit > msize && dsize > dlimit ) return false;
-    return true;
-    }
-  };
-
-
-const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
-const char * const bad_dict_msg = "Invalid dictionary size in member header.";
-const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
-const char * const trailing_msg = "Trailing data not allowed.";
-
-// defined in extract.cc
-int readblock( const int fd, uint8_t * const buf, const int size );
-int writeblock( const int fd, const uint8_t * const buf, const int size );
--- a/lzip_index.cc
+++ b/lzip_index.cc
@ -26,8 +26,8 @@
 #include <stdint.h>
 #include <unistd.h>

-#include "lzip.h"
 #include "lzip_index.h"
+#include "tarlz.h"


 namespace {
--- a/main.cc
+++ b/main.cc
@ -87,6 +87,7 @@ void show_help( const long num_online )
               "  -h, --help                 display this help and exit\n"
               "  -V, --version              output version information and exit\n"
               "  -A, --concatenate          append tar.lz archives to the end of an archive\n"
+               "  -B, --data-size=<bytes>    set target size of input data blocks [2x8=16 MiB]\n"
               "  -c, --create               create a new archive\n"
               "  -C, --directory=<dir>      change to directory <dir>\n"
               "  -f, --file=<archive>       use archive file <archive>\n"
@ -98,6 +99,7 @@ void show_help( const long num_online )
               "  -x, --extract              extract files from an archive\n"
               "  -0 .. -9                   set compression level [default 6]\n"
               "      --asolid               create solidly compressed appendable archive\n"
+               "      --bsolid               create per-data-block compressed archive\n"
               "      --dsolid               create per-directory compressed archive\n"
               "      --no-solid             create per-file compressed archive (default)\n"
               "      --solid                create solidly compressed archive\n"
@ -284,8 +286,8 @@ int main( const int argc, const char * const argv[] )
    { show_error( "Bad library version. At least lzlib 1.0 is required." );
      return 1; }

-  enum { opt_ano = 256, opt_aso, opt_crc, opt_dbg, opt_dso, opt_grp, opt_kd,
-         opt_nso, opt_own, opt_per, opt_sol, opt_un };
+  enum { opt_ano = 256, opt_aso, opt_bso, opt_crc, opt_dbg, opt_dso, opt_grp,
+         opt_kd, opt_nso, opt_own, opt_per, opt_sol, opt_un };
  const Arg_parser::Option options[] =
    {
    { '0',  0,                 Arg_parser::no  },
@ -299,6 +301,7 @@ int main( const int argc, const char * const argv[] )
    { '8',  0,                 Arg_parser::no  },
    { '9',  0,                 Arg_parser::no  },
    { 'A', "concatenate",      Arg_parser::no  },
+    { 'B', "data-size",        Arg_parser::yes },
    { 'c', "create",           Arg_parser::no  },
    { 'C', "directory",        Arg_parser::yes },
    { 'f', "file",             Arg_parser::yes },
@ -313,6 +316,7 @@ int main( const int argc, const char * const argv[] )
    { 'x', "extract",          Arg_parser::no  },
    { opt_ano, "anonymous",    Arg_parser::no  },
    { opt_aso, "asolid",       Arg_parser::no  },
+    { opt_bso, "bsolid",       Arg_parser::no  },
    { opt_dbg, "debug",        Arg_parser::yes },
    { opt_dso, "dsolid",       Arg_parser::no  },
    { opt_grp, "group",        Arg_parser::yes },
@ -347,6 +351,8 @@ int main( const int argc, const char * const argv[] )
      case '5': case '6': case '7': case '8': case '9':
                level = code - '0'; break;
      case 'A': set_mode( program_mode, m_concatenate ); break;
+      case 'B': cl_data_size = getnum( arg, min_data_size, max_data_size );
+                break;
      case 'c': set_mode( program_mode, m_create ); break;
      case 'C': break;					// skip chdir
      case 'f': if( sarg != "-" ) archive_name = sarg; break;
@ -361,6 +367,7 @@ int main( const int argc, const char * const argv[] )
      case 'x': set_mode( program_mode, m_extract ); break;
      case opt_ano: set_owner( "root" ); set_group( "root" ); break;
      case opt_aso: solidity = asolid; break;
+      case opt_bso: solidity = bsolid; break;
      case opt_crc: missing_crc = true; break;
      case opt_dbg: debug_level = getnum( arg, 0, 3 ); break;
      case opt_dso: solidity = dsolid; break;
--- a/tarlz.h
+++ b/tarlz.h
@ -42,22 +42,195 @@ inline bool verify_ustar_magic( const uint8_t * const header )
  { return std::memcmp( header + magic_o, ustar_magic, magic_l ) == 0; }


-class CRC32C			// Uses CRC32-C (Castagnoli) polynomial.
+// Return "size" rounded up to the nearest multiple of header size (512).
+// A size that is already a multiple is returned unchanged.
+//
+inline unsigned long long round_up( const unsigned long long size )
+  {
+  const int excess = size % header_size;	// bytes past the last full block
+  if( excess == 0 ) return size;
+  return size + ( header_size - excess );
+  }
+
+
+class Extended		// stores metadata from/for extended records
+  {
+  std::string linkpath_;	// link target; empty if not set
+  std::string path_;	// member name; empty if not set
+  unsigned long long file_size_;	// 0 if not set
+
+  mutable long long full_size_;		// cached sizes; -1 = invalid, set on every change
+  mutable int recsize_linkpath_;
+  mutable int recsize_path_;
+  mutable int recsize_file_size_;
+
+  bool crc_present_;		// true if CRC present in parsed records
+
+public:
+  static const std::string crc_record;	// parse() requires a 'GNU.crc32' record to have this size()
+
+  Extended()
+    : file_size_( 0 ), full_size_( -1 ), recsize_linkpath_( -1 ),
+      recsize_path_( -1 ), recsize_file_size_( -1 ), crc_present_( false ) {}
+
+  void reset()	// back to the default-constructed state
+    { linkpath_.clear(); path_.clear(); file_size_ = 0; full_size_ = -1;
+      recsize_linkpath_ = -1; recsize_path_ = -1; recsize_file_size_ = -1;
+      crc_present_ = false; }
+
+  bool empty() const	// true if no linkpath, path, or size is stored; ignores the CRC flag
+    { return linkpath_.empty() && path_.empty() && file_size_ == 0; }
+
+  const std::string & linkpath() const { return linkpath_; }
+  const std::string & path() const { return path_; }
+  unsigned long long file_size() const { return file_size_; }
+
+  void linkpath( const char * const lp )	// each setter invalidates the cached sizes it affects
+    { linkpath_ = lp; full_size_ = -1; recsize_linkpath_ = -1; }
+  void path( const char * const p )
+    { path_ = p; full_size_ = -1; recsize_path_ = -1; }
+  void file_size( const unsigned long long fs )
+    { file_size_ = fs; full_size_ = -1; recsize_file_size_ = -1; }
+
+  int recsize_linkpath() const;
+  int recsize_path() const;
+  int recsize_file_size() const;
+  unsigned long long edsize() const		// extended data size; 0 if empty()
+    { return empty() ? 0 : recsize_linkpath() + recsize_path() +
+                           recsize_file_size() + crc_record.size(); }
+  unsigned long long edsize_pad() const		// edsize rounded up
+    { return round_up( edsize() ); }
+  unsigned long long full_size() const	// header + padded extended data; 0 if empty(); cached
+    { if( full_size_ < 0 )
+        full_size_ = ( empty() ? 0 : header_size + edsize_pad() );
+      return full_size_; }
+
+  bool crc_present() const { return crc_present_; }
+  bool parse( const char * const buf, const unsigned long long edsize,
+              const bool permissive );	// resets, then fills this object from 'edsize' bytes at 'buf'
+  };
+
+
+enum {
+  min_dictionary_bits = 12,
+  min_dictionary_size = 1 << min_dictionary_bits,	// 4 KiB
+  max_dictionary_bits = 29,
+  max_dictionary_size = 1 << max_dictionary_bits,	// 512 MiB
+  min_member_size = 36,	// smaller member sizes are rejected by Lzip_trailer::verify_consistency
+  min_data_size = 2 * min_dictionary_size,	// 8 KiB; passed to getnum() as lower limit of '-B, --data-size'
+  max_data_size = 2 * max_dictionary_size };	// 1 GiB; passed to getnum() as upper limit of '-B, --data-size'
+
+
+// Return true if 'dictionary_size' lies in the closed range
+// [min_dictionary_size, max_dictionary_size].
+inline bool isvalid_ds( const unsigned dictionary_size )
+  {
+  if( dictionary_size < min_dictionary_size ) return false;
+  return dictionary_size <= max_dictionary_size;
+  }
+
+
+const uint8_t lzip_magic[4] = { 0x4C, 0x5A, 0x49, 0x50 };	// "LZIP"
+
+struct Lzip_header
+  {
+  uint8_t data[6];			// 0-3 magic bytes
+					//   4 version
+					//   5 coded_dict_size
+  enum { size = 6 };
+
+  // True if the first 4 bytes are exactly the lzip magic string.
+  bool verify_magic() const
+    { return ( std::memcmp( data, lzip_magic, 4 ) == 0 ); }
+
+  // True if the first 'sz' bytes (at most 4 are compared) match the start
+  // of the magic string. Returns false if 'sz' <= 0.
+  bool verify_prefix( const int sz ) const	// detect (truncated) header
+    {
+    for( int i = 0; i < sz && i < 4; ++i )
+      if( data[i] != lzip_magic[i] ) return false;
+    return ( sz > 0 );
+    }
+  // True if 2 or 3 of the 4 magic bytes match; a full match is not corrupt.
+  bool verify_corrupt() const			// detect corrupt header
+    {
+    int matches = 0;
+    for( int i = 0; i < 4; ++i )
+      if( data[i] == lzip_magic[i] ) ++matches;
+    return ( matches > 1 && matches < 4 );
+    }
+
+  uint8_t version() const { return data[4]; }
+  bool verify_version() const { return ( data[4] == 1 ); }
+
+  // Decode byte 5: the low 5 bits give a power of 2 and the high 3 bits
+  // the number of sixteenths of that power to subtract from it. The
+  // subtraction is skipped if the power is not above min_dictionary_size.
+  unsigned dictionary_size() const
+    {
+    // Shift an unsigned 1: the 5-bit exponent can be as large as 31, and
+    // '1 << 31' overflows a 32-bit signed int.
+    unsigned sz = ( 1U << ( data[5] & 0x1F ) );
+    if( sz > min_dictionary_size )
+      sz -= ( sz / 16 ) * ( ( data[5] >> 5 ) & 7 );
+    return sz;
+    }
+  };
+
+
+struct Lzip_trailer
+  {
+  uint8_t data[20];	//  0-3  CRC32 of the uncompressed data
+			//  4-11 size of the uncompressed data
+			// 12-19 member size including header and trailer
+  enum { size = 20 };
+
+  // The three fields are decoded least significant byte first.
+
+  unsigned data_crc() const
+    {
+    unsigned crc = 0;
+    for( int i = 0; i < 4; ++i )
+      crc |= (unsigned)data[i] << ( 8 * i );
+    return crc;
+    }
+
+  unsigned long long data_size() const
+    {
+    unsigned long long sz = 0;
+    for( int i = 0; i < 8; ++i )
+      sz |= (unsigned long long)data[4+i] << ( 8 * i );
+    return sz;
+    }
+
+  unsigned long long member_size() const
+    {
+    unsigned long long sz = 0;
+    for( int i = 0; i < 8; ++i )
+      sz |= (unsigned long long)data[12+i] << ( 8 * i );
+    return sz;
+    }
+
+  // Return false if the three fields cannot belong to the same member:
+  // exactly one of CRC and data size is zero, the member is smaller than
+  // min_member_size, or one size is out of proportion to the other.
+  bool verify_consistency() const	// check internal consistency
+    {
+    const unsigned long long usize = data_size();
+    const bool zero_crc = ( data_crc() == 0 );
+    const bool zero_size = ( usize == 0 );
+    if( zero_crc != zero_size ) return false;
+    const unsigned long long csize = member_size();
+    if( csize < min_member_size ) return false;
+    // Each limit is applied only if it compares greater than the value it
+    // was computed from (presumably to ignore a wrapped-around limit).
+    const unsigned long long max_csize = ( 9 * usize + 7 ) / 8 + min_member_size;
+    if( max_csize > usize && csize > max_csize ) return false;
+    const unsigned long long max_usize = 7090 * ( csize - 26 ) - 1;
+    if( max_usize > csize && usize > max_usize ) return false;
+    return true;
+    }
+  };
+
+
+class CRC32
  {
  uint32_t data[256];		// Table of CRCs of all 8-bit messages.

 public:
-  CRC32C()
+  CRC32( const bool castagnoli = false )
    {
+    const unsigned cpol = 0x82F63B78U;	// CRC32-C  Castagnoli polynomial.
+    const unsigned ipol = 0xEDB88320U;	// IEEE 802.3 Ethernet polynomial.
+    const unsigned poly = castagnoli ? cpol : ipol;
+
    for( unsigned n = 0; n < 256; ++n )
      {
      unsigned c = n;
      for( int k = 0; k < 8; ++k )
-        { if( c & 1 ) c = 0x82F63B78U ^ ( c >> 1 ); else c >>= 1; }
+        { if( c & 1 ) c = poly ^ ( c >> 1 ); else c >>= 1; }
      data[n] = c;
      }
    }

+  void update_byte( uint32_t & crc, const uint8_t byte ) const
+    { crc = data[(crc^byte)&0xFF] ^ ( crc >> 8 ); }
+
  void update_buf( uint32_t & crc, const uint8_t * const buffer,
                   const int size ) const
    {
@ -78,32 +251,7 @@ public:
    }
  };

-extern const CRC32C crc32c;
-
-
-// Round "size" to the next multiple of header size (512).
-//
-inline unsigned long long round_up( unsigned long long size )
-  {
-  const int rem = size % header_size;
-  const int padding = rem ? header_size - rem : 0;
-  return size + padding;
-  }
-
-
-struct Extended		// stores metadata from/for extended records
-  {
-  std::string linkpath;
-  std::string path;
-  unsigned long long size;
-  bool crc_present;
-  Extended() : size( 0 ), crc_present( false ) {}
-  void reset()
-    { linkpath.clear(); path.clear(); size = 0; crc_present = false; }
-  bool empty() { return linkpath.empty() && path.empty() && size == 0; }
-  bool parse( const char * const buf, const unsigned long long edsize,
-              const bool permissive );
-  };
+extern const CRC32 crc32c;


 enum { initial_line_length = 1000 };	// must be >= 77
@ -132,10 +280,16 @@ public:
  unsigned size() const { return size_; }
  };

+const char * const bad_magic_msg = "Bad magic number (file not in lzip format).";
+const char * const bad_dict_msg = "Invalid dictionary size in member header.";
+const char * const corrupt_mm_msg = "Corrupt header in multimember file.";
+const char * const trailing_msg = "Trailing data not allowed.";
+
 // defined in create.cc
-enum Solidity { no_solid, dsolid, asolid, solid };
+enum Solidity { no_solid, bsolid, dsolid, asolid, solid };
 extern int cl_owner;
 extern int cl_group;
+extern int cl_data_size;
 extern Solidity solidity;
 unsigned ustar_chksum( const uint8_t * const header );
 bool verify_ustar_chksum( const uint8_t * const header );
@ -152,6 +306,8 @@ void format_member_name( const Extended & extended, const Tar_header header,
 const char * remove_leading_slash( const char * const filename );
 bool compare_prefix_dir( const char * const dir, const char * const name );
 bool compare_tslash( const char * const name1, const char * const name2 );
+int readblock( const int fd, uint8_t * const buf, const int size );
+int writeblock( const int fd, const uint8_t * const buf, const int size );
 unsigned long long parse_octal( const uint8_t * const ptr, const int size );
 int decode( const std::string & archive_name, const Arg_parser & parser,
            const int filenames, const int num_workers, const int debug_level,
--- a/testsuite/check.sh
+++ b/testsuite/check.sh
@ -65,7 +65,7 @@ lzlib_1_11() { [ ${lwarn} = 0 ] &&

 # Description of test files for tarlz:
 # test.txt.tar.lz:   1 member (test.txt).
-# t155.tar[.lz]:     directory + file + link + eof, all with 155 char names
+# t155.tar[.lz]:     directory + links + file + eof, all with 155 char names
 # tar_in_tlz1.tar.lz 2 members (test.txt.tar test3.tar) 3 lzip members
 # tar_in_tlz2.tar.lz 2 members (test.txt.tar test3.tar) 5 lzip members
 # test_bad1.tar.lz:  truncated at offset 6000 (of 7495)
@ -163,10 +163,11 @@ rm -f test.txt || framework_failure
 "${TARLZ}" -xf "${in_tar}" --missing-crc || test_failed $LINENO
 cmp "${in}" test.txt || test_failed $LINENO
 rm -f test.txt || framework_failure
-#
-printf "foo\n" > cfoo || framework_failure
-printf "bar\n" > cbar || framework_failure
-printf "baz\n" > cbaz || framework_failure
+
+# reference files for cmp
+cat "${testdir}"/rfoo > cfoo || framework_failure
+cat "${testdir}"/rbar > cbar || framework_failure
+cat "${testdir}"/rbaz > cbaz || framework_failure
 rm -f foo bar baz || framework_failure
 "${TARLZ}" -xf "${test3_lz}" --missing-crc || test_failed $LINENO
 cmp cfoo foo || test_failed $LINENO
@ -261,7 +262,7 @@ for i in "${tarint1_lz}" "${tarint2_lz}" ; do
 	cmp out0 out6 || test_failed $LINENO
 	cmp out2 out6 || test_failed $LINENO
 	cmp outv0 outv2 || test_failed $LINENO
-	cmp outv0 outv2 || test_failed $LINENO
+	cmp outv0 outv6 || test_failed $LINENO
 	cmp outv2 outv6 || test_failed $LINENO
 	rm -f out0 out2 out6 outv0 outv2 outv6 || framework_failure
 	"${TARLZ}" -xf "$i" || test_failed $LINENO
@ -409,14 +410,14 @@ cat cbar > bar || framework_failure
 cat cbaz > baz || framework_failure
 "${TARLZ}" --solid -0 -cf out.tar.lz foo || test_failed $LINENO
 cat out.tar.lz > aout.tar.lz || framework_failure
-for i in --asolid --dsolid --solid -0 ; do
+for i in --asolid --bsolid --dsolid --solid -0 ; do
 	"${TARLZ}" $i -q -rf out.tar.lz bar baz
 	[ $? = 2 ] || test_failed $LINENO $i
 	cmp out.tar.lz aout.tar.lz || test_failed $LINENO $i
 done
 rm -f out.tar.lz aout.tar.lz || framework_failure
-for i in --asolid --dsolid -0 ; do
-	for j in --asolid --dsolid --solid -0 ; do
+for i in --asolid --bsolid --dsolid -0 ; do
+	for j in --asolid --bsolid --dsolid --solid -0 ; do
 		"${TARLZ}" $i -0 -cf out.tar.lz foo ||
 			test_failed $LINENO "$i $j"
 		"${TARLZ}" $j -0 -rf out.tar.lz bar baz ||
--- a/testsuite/rbar
+++ b/testsuite/rbar
@ -0,0 +1 @@
+bar
--- a/testsuite/rbaz
+++ b/testsuite/rbaz
@ -0,0 +1 @@
+baz
--- a/testsuite/rfoo
+++ b/testsuite/rfoo
@ -0,0 +1 @@
+foo
--- a/testsuite/t155.tar
+++ b/testsuite/t155.tar
--- a/testsuite/t155.tar.lz
+++ b/testsuite/t155.tar.lz