
Adding upstream version 1.7.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-17 20:40:42 +01:00
parent 0887b34103
commit da5ddefa70
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
10 changed files with 208 additions and 205 deletions

ChangeLog

@@ -1,11 +1,6 @@
-2015-05-23  Antonio Diaz Diaz  <antonio@gnu.org>
+2015-07-07  Antonio Diaz Diaz  <antonio@gnu.org>
-	* Version 1.7-rc1 released.
+	* Version 1.7 released.
-	* main.c (compress): Fixed spurious warning about uninitialized var.
-2015-02-26  Antonio Diaz Diaz  <antonio@gnu.org>
-	* Version 1.7-pre1 released.
 	* Ported fast encoder and option '-0' from lzip.
 	* Makefile.in: Added new targets 'install*-compress'.

README

@@ -45,6 +45,13 @@ each file without exceeding the given limit. Keep in mind that the
 decompression memory requirement is affected at compression time by the
 choice of dictionary size limit.
 
+The amount of memory required for compression is about 1 or 2 times the
+dictionary size limit (1 if input file size is less than dictionary size
+limit, else 2) plus 9 times the dictionary size really used. The option
+'-0' is special and only requires about 1.5 MiB at most. The amount of
+memory required for decompression is about 46 kB larger than the
+dictionary size really used.
+
 When compressing, clzip replaces every file given in the command line
 with a compressed version of itself, with the name "original_name.lz".
 When decompressing, clzip attempts to guess the name for the decompressed
@@ -93,7 +100,7 @@ used by lzip could be developed, and the resulting sequence could also
 be coded using the LZMA coding scheme.
 
 Clzip currently implements two variants of the LZMA algorithm; fast
-(used by option -0) and normal (used by all other compression levels).
+(used by option '-0') and normal (used by all other compression levels).
 
 The high compression of LZMA comes from combining two basic, well-proven
 compression ideas: sliding dictionaries (LZ77/78) and markov models (the
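
The memory figures quoted in the README hunk above amount to a simple
rule of thumb. The sketch below is not clzip code; the function name and
the sample sizes are invented for illustration, and it merely restates
the rule (1 or 2 times the dictionary size limit, plus 9 times the
dictionary size really used) in C:

  /* Illustrative only: rough estimate of clzip's compression memory use,
     following the figures quoted in the README above. Hypothetical name. */
  #include <stdio.h>

  static unsigned long long
  estimate_compression_memory( const unsigned long long file_size,
                               const unsigned long long dict_limit,
                               const unsigned long long dict_used )
    {
    const int times = ( file_size < dict_limit ) ? 1 : 2;
    return times * dict_limit + 9ULL * dict_used;
    }

  int main( void )
    {
    /* e.g. a 100 MiB file with an 8 MiB dictionary size limit: the whole
       8 MiB dictionary is used, so about 2*8 + 9*8 = 88 MiB. */
    const unsigned long long MiB = 1ULL << 20;
    printf( "estimated memory: %llu MiB\n",
            estimate_compression_memory( 100 * MiB, 8 * MiB, 8 * MiB ) / MiB );
    return 0;
    }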

configure

@@ -6,7 +6,7 @@
 # to copy, distribute and modify it.
 
 pkgname=clzip
-pkgversion=1.7-rc1
+pkgversion=1.7
 progname=clzip
 srctrigger=doc/${pkgname}.texi

decoder.c

@@ -38,11 +38,10 @@ void Pp_show_msg( struct Pretty_print * const pp, const char * const msg )
 {
 if( pp->first_post )
 {
-int i, len;
+int i, len = pp->longest_name - strlen( pp->name );
 pp->first_post = false;
 fprintf( stderr, " %s: ", pp->name );
-len = pp->longest_name - strlen( pp->name );
-for( i = 0; i < len; ++i ) fprintf( stderr, " " );
+for( i = 0; i < len; ++i ) fputc( ' ', stderr );
 if( !msg ) fflush( stderr );
 }
 if( msg ) fprintf( stderr, "%s\n", msg );
@@ -153,7 +152,7 @@ static bool LZd_verify_trailer( struct LZ_decoder * const d,
 if( verbosity >= 0 )
 {
 Pp_show_msg( pp, 0 );
-fprintf( stderr, "CRC mismatch; trailer says %08X, data CRC is %08X.\n",
+fprintf( stderr, "CRC mismatch; trailer says %08X, data CRC is %08X\n",
 trailer_crc, LZd_crc( d ) );
 }
 }
@@ -164,7 +163,7 @@ static bool LZd_verify_trailer( struct LZ_decoder * const d,
 if( verbosity >= 0 )
 {
 Pp_show_msg( pp, 0 );
-fprintf( stderr, "Data size mismatch; trailer says %llu, data size is %llu (0x%llX).\n",
+fprintf( stderr, "Data size mismatch; trailer says %llu, data size is %llu (0x%llX)\n",
 trailer_data_size, LZd_data_position( d ), LZd_data_position( d ) );
 }
 }
@@ -175,7 +174,7 @@ static bool LZd_verify_trailer( struct LZ_decoder * const d,
 if( verbosity >= 0 )
 {
 Pp_show_msg( pp, 0 );
-fprintf( stderr, "Member size mismatch; trailer says %llu, member size is %llu (0x%llX).\n",
+fprintf( stderr, "Member size mismatch; trailer says %llu, member size is %llu (0x%llX)\n",
 trailer_member_size, member_size, member_size );
 }
 }
@@ -224,7 +223,7 @@ int LZd_decode_member( struct LZ_decoder * const d,
 LZd_peek( d, rep0 ) ) );
 }
 }
-else
+else /* match or repeated match */
 {
 int len;
 if( Rd_decode_bit( rdec, &d->bm_rep[state] ) != 0 ) /* 2nd bit */
@@ -254,7 +253,7 @@ int LZd_decode_member( struct LZ_decoder * const d,
 state = St_set_rep( state );
 len = min_match_len + Rd_decode_len( rdec, &d->rep_len_model, pos_state );
 }
-else
+else /* match */
 {
 int dis_slot;
 const unsigned rep0_saved = rep0;
@@ -288,7 +287,7 @@ int LZd_decode_member( struct LZ_decoder * const d,
 if( verbosity >= 0 )
 {
 Pp_show_msg( pp, 0 );
-fprintf( stderr, "Unsupported marker code '%d'.\n", len );
+fprintf( stderr, "Unsupported marker code '%d'\n", len );
 }
 return 4;
 }
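
The three values checked by LZd_verify_trailer above come from the
20-byte member trailer defined by the lzip format: the CRC32 of the
uncompressed data, the uncompressed data size, and the total member
size, each stored little endian. The fragment below only illustrates
that layout; it is not clzip's decoder, and the struct and function
names are invented:

  /* Illustrative sketch of the lzip member trailer layout (little endian):
     bytes 0-3 CRC32 of the uncompressed data, bytes 4-11 data size,
     bytes 12-19 member size. Names are hypothetical, not clzip's. */
  #include <stdint.h>

  struct Trailer_fields
    {
    uint32_t data_crc;     /* CRC32 of the uncompressed data */
    uint64_t data_size;    /* size of the uncompressed data */
    uint64_t member_size;  /* header + compressed data + trailer */
    };

  static struct Trailer_fields parse_trailer( const uint8_t buf[20] )
    {
    struct Trailer_fields t = { 0, 0, 0 };
    int i;
    for( i = 3; i >= 0; --i ) t.data_crc = ( t.data_crc << 8 ) + buf[i];
    for( i = 11; i >= 4; --i ) t.data_size = ( t.data_size << 8 ) + buf[i];
    for( i = 19; i >= 12; --i ) t.member_size = ( t.member_size << 8 ) + buf[i];
    return t;
    }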

doc/clzip.1

@@ -1,5 +1,5 @@
 .\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.46.1.
-.TH CLZIP "1" "May 2015" "clzip 1.7-rc1" "User Commands"
+.TH CLZIP "1" "July 2015" "clzip 1.7" "User Commands"
 .SH NAME
 clzip \- reduces the size of files
 .SH SYNOPSIS
@@ -28,7 +28,7 @@ decompress
 overwrite existing output files
 .TP
 \fB\-F\fR, \fB\-\-recompress\fR
-force recompression of compressed files
+force re\-compression of compressed files
 .TP
 \fB\-k\fR, \fB\-\-keep\fR
 keep (don't delete) input files

doc/clzip.info

@@ -11,14 +11,14 @@ File: clzip.info, Node: Top, Next: Introduction, Up: (dir)
 Clzip Manual
 ************
 
-This manual is for Clzip (version 1.7-rc1, 23 May 2015).
+This manual is for Clzip (version 1.7, 7 July 2015).
 
 * Menu:
 
 * Introduction:: Purpose and features of clzip
-* Algorithm:: How clzip compresses the data
 * Invoking clzip:: Command line interface
 * File format:: Detailed format of the compressed file
+* Algorithm:: How clzip compresses the data
 * Examples:: A small tutorial with examples
 * Problems:: Reporting bugs
 * Concept index:: Index of concepts
@@ -30,7 +30,7 @@ This manual is for Clzip (version 1.7-rc1, 23 May 2015).
 copy, distribute and modify it.
 
-File: clzip.info, Node: Introduction, Next: Algorithm, Prev: Top, Up: Top
+File: clzip.info, Node: Introduction, Next: Invoking clzip, Prev: Top, Up: Top
 
 1 Introduction
 **************
@@ -53,7 +53,8 @@ availability:
 recovery means. The lziprecover program can repair bit-flip errors
 (one of the most common forms of data corruption) in lzip files,
 and provides data recovery capabilities, including error-checked
-merging of damaged copies of a file.
+merging of damaged copies of a file. *note Data safety:
+(lziprecover)Data safety.
 
 * The lzip format is as simple as possible (but not simpler). The
 lzip manual provides the code of a simple decompressor along with
@@ -87,6 +88,11 @@ bzip2, which makes it safer than compressors returning ambiguous warning
 values (like gzip) when it is used as a back end for other programs like
 tar or zutils.
 
+Clzip will automatically use the smallest possible dictionary size
+for each file without exceeding the given limit. Keep in mind that the
+decompression memory requirement is affected at compression time by the
+choice of dictionary size limit.
+
 The amount of memory required for compression is about 1 or 2 times
 the dictionary size limit (1 if input file size is less than dictionary
 size limit, else 2) plus 9 times the dictionary size really used. The
@@ -94,11 +100,6 @@ option '-0' is special and only requires about 1.5 MiB at most. The
 amount of memory required for decompression is about 46 kB larger than
 the dictionary size really used.
 
-Clzip will automatically use the smallest possible dictionary size
-for each file without exceeding the given limit. Keep in mind that the
-decompression memory requirement is affected at compression time by the
-choice of dictionary size limit.
-
 When compressing, clzip replaces every file given in the command line
 with a compressed version of itself, with the name "original_name.lz".
 When decompressing, clzip attempts to guess the name for the
@@ -138,75 +139,9 @@ automatically creating multi-member output. The members so created are
 large, about 2 PiB each.
 
-File: clzip.info, Node: Algorithm, Next: Invoking clzip, Prev: Introduction, Up: Top
+File: clzip.info, Node: Invoking clzip, Next: File format, Prev: Introduction, Up: Top
 
-2 Algorithm
-***********
-
-In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a
-concrete algorithm; it is more like "any algorithm using the LZMA coding
-scheme". For example, the option '-0' of lzip uses the scheme in almost
-the simplest way possible; issuing the longest match it can find, or a
-literal byte if it can't find a match. Inversely, a much more elaborated
-way of finding coding sequences of minimum size than the one currently
-used by lzip could be developed, and the resulting sequence could also
-be coded using the LZMA coding scheme.
-
-Clzip currently implements two variants of the LZMA algorithm; fast
-(used by option -0) and normal (used by all other compression levels).
-
-The high compression of LZMA comes from combining two basic,
-well-proven compression ideas: sliding dictionaries (LZ77/78) and
-markov models (the thing used by every compression algorithm that uses
-a range encoder or similar order-0 entropy coder as its last stage)
-with segregation of contexts according to what the bits are used for.
-
-Clzip is a two stage compressor. The first stage is a Lempel-Ziv
-coder, which reduces redundancy by translating chunks of data to their
-corresponding distance-length pairs. The second stage is a range encoder
-that uses a different probability model for each type of data;
-distances, lengths, literal bytes, etc.
-
-Here is how it works, step by step:
-
-1) The member header is written to the output stream.
-
-2) The first byte is coded literally, because there are no previous
-bytes to which the match finder can refer to.
-
-3) The main encoder advances to the next byte in the input data and
-calls the match finder.
-
-4) The match finder fills an array with the minimum distances before
-the current byte where a match of a given length can be found.
-
-5) Go back to step 3 until a sequence (formed of pairs, repeated
-distances and literal bytes) of minimum price has been formed. Where the
-price represents the number of output bits produced.
-
-6) The range encoder encodes the sequence produced by the main
-encoder and sends the produced bytes to the output stream.
-
-7) Go back to step 3 until the input data are finished or until the
-member or volume size limits are reached.
-
-8) The range encoder is flushed.
-
-9) The member trailer is written to the output stream.
-
-10) If there are more data to compress, go back to step 1.
-
-The ideas embodied in clzip are due to (at least) the following people:
-Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for
-the definition of Markov chains), G.N.N. Martin (for the definition of
-range encoding), Igor Pavlov (for putting all the above together in
-LZMA), and Julian Seward (for bzip2's CLI).
-
-File: clzip.info, Node: Invoking clzip, Next: File format, Prev: Algorithm, Up: Top
-
-3 Invoking clzip
+2 Invoking clzip
 ****************
 
 The format for running clzip is:
@@ -246,7 +181,7 @@ The format for running clzip is:
 '-F'
 '--recompress'
-Force recompression of files whose name already has the '.lz' or
+Force re-compression of files whose name already has the '.lz' or
 '.tlz' suffix.
 
 '-k'
@@ -363,9 +298,9 @@ invalid input file, 3 for an internal consistency error (eg, bug) which
 caused clzip to panic.
 
-File: clzip.info, Node: File format, Next: Examples, Prev: Invoking clzip, Up: Top
+File: clzip.info, Node: File format, Next: Algorithm, Prev: Invoking clzip, Up: Top
 
-4 File format
+3 File format
 *************
 
 Perfection is reached, not when there is no longer anything to add, but
@@ -434,7 +369,73 @@ additional information before, between, or after them.
 
-File: clzip.info, Node: Examples, Next: Problems, Prev: File format, Up: Top
+File: clzip.info, Node: Algorithm, Next: Examples, Prev: File format, Up: Top
+
+4 Algorithm
+***********
+
+In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a
+concrete algorithm; it is more like "any algorithm using the LZMA coding
+scheme". For example, the option '-0' of lzip uses the scheme in almost
+the simplest way possible; issuing the longest match it can find, or a
+literal byte if it can't find a match. Inversely, a much more elaborated
+way of finding coding sequences of minimum size than the one currently
+used by lzip could be developed, and the resulting sequence could also
+be coded using the LZMA coding scheme.
+
+Clzip currently implements two variants of the LZMA algorithm; fast
+(used by option '-0') and normal (used by all other compression levels).
+
+The high compression of LZMA comes from combining two basic,
+well-proven compression ideas: sliding dictionaries (LZ77/78) and
+markov models (the thing used by every compression algorithm that uses
+a range encoder or similar order-0 entropy coder as its last stage)
+with segregation of contexts according to what the bits are used for.
+
+Clzip is a two stage compressor. The first stage is a Lempel-Ziv
+coder, which reduces redundancy by translating chunks of data to their
+corresponding distance-length pairs. The second stage is a range encoder
+that uses a different probability model for each type of data;
+distances, lengths, literal bytes, etc.
+
+Here is how it works, step by step:
+
+1) The member header is written to the output stream.
+
+2) The first byte is coded literally, because there are no previous
+bytes to which the match finder can refer to.
+
+3) The main encoder advances to the next byte in the input data and
+calls the match finder.
+
+4) The match finder fills an array with the minimum distances before
+the current byte where a match of a given length can be found.
+
+5) Go back to step 3 until a sequence (formed of pairs, repeated
+distances and literal bytes) of minimum price has been formed. Where the
+price represents the number of output bits produced.
+
+6) The range encoder encodes the sequence produced by the main
+encoder and sends the produced bytes to the output stream.
+
+7) Go back to step 3 until the input data are finished or until the
+member or volume size limits are reached.
+
+8) The range encoder is flushed.
+
+9) The member trailer is written to the output stream.
+
+10) If there are more data to compress, go back to step 1.
+
+The ideas embodied in clzip are due to (at least) the following people:
+Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for
+the definition of Markov chains), G.N.N. Martin (for the definition of
+range encoding), Igor Pavlov (for putting all the above together in
+LZMA), and Julian Seward (for bzip2's CLI).
+
+File: clzip.info, Node: Examples, Next: Problems, Prev: Algorithm, Up: Top
 
 5 A small tutorial with examples
 ********************************
@@ -545,13 +546,13 @@ Concept index
 
 Tag Table:
 Node: Top210
-Node: Introduction897
-Node: Algorithm6100
-Node: Invoking clzip8930
-Node: File format14479
-Node: Examples16881
-Node: Problems18850
-Node: Concept index19376
+Node: Introduction893
+Node: Invoking clzip6152
+Node: File format11705
+Node: Algorithm14108
+Node: Examples16933
+Node: Problems18900
+Node: Concept index19426
 
 End Tag Table
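
The "fast" variant described in the Algorithm chapter above boils down
to a greedy strategy: emit the longest match found in the window,
otherwise a literal byte. The toy tokenizer below illustrates only that
first, Lempel-Ziv stage with a brute-force match search; it is far
simpler than clzip's real match finder, omits the range-encoding stage
entirely, and every name in it is invented:

  /* Toy greedy LZ77 tokenizer: for each position, emit the longest match
     found in a small window, else a literal byte. Illustration only. */
  #include <stdio.h>
  #include <string.h>

  enum { window_size = 1 << 12, min_match = 3 };

  static void tokenize( const unsigned char * const data, const int size )
    {
    int pos = 0;
    while( pos < size )
      {
      int best_len = 0, best_dis = 0, i;
      const int start = ( pos > window_size ) ? pos - window_size : 0;
      for( i = start; i < pos; ++i )    /* brute-force search, O(n^2) */
        {
        int len = 0;
        while( pos + len < size && data[i+len] == data[pos+len] ) ++len;
        if( len > best_len ) { best_len = len; best_dis = pos - i; }
        }
      if( best_len >= min_match )
        { printf( "match: distance %d, length %d\n", best_dis, best_len );
          pos += best_len; }
      else
        { printf( "literal: 0x%02X\n", data[pos] ); ++pos; }
      }
    }

  int main( void )
    {
    const char * const msg = "abracadabra abracadabra";
    tokenize( (const unsigned char *)msg, (int)strlen( msg ) );
    return 0;
    }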

doc/clzip.texi

@@ -6,8 +6,8 @@
 @finalout
 @c %**end of header
 
-@set UPDATED 23 May 2015
-@set VERSION 1.7-rc1
+@set UPDATED 7 July 2015
+@set VERSION 1.7
 
 @dircategory Data Compression
 @direntry
@@ -36,9 +36,9 @@ This manual is for Clzip (version @value{VERSION}, @value{UPDATED}).
 
 @menu
 * Introduction:: Purpose and features of clzip
-* Algorithm:: How clzip compresses the data
 * Invoking clzip:: Command line interface
 * File format:: Detailed format of the compressed file
+* Algorithm:: How clzip compresses the data
 * Examples:: A small tutorial with examples
 * Problems:: Reporting bugs
 * Concept index:: Index of concepts
@@ -72,10 +72,14 @@ availability:
 @itemize @bullet
 @item
 The lzip format provides very safe integrity checking and some data
-recovery means. The lziprecover program can repair bit-flip errors (one
-of the most common forms of data corruption) in lzip files, and provides
-data recovery capabilities, including error-checked merging of damaged
-copies of a file.
+recovery means. The
+@uref{http://www.nongnu.org/lzip/manual/lziprecover_manual.html#Data-safety,,lziprecover}
+program can repair bit-flip errors (one of the most common forms of data
+corruption) in lzip files, and provides data recovery capabilities,
+including error-checked merging of damaged copies of a file.
+@ifnothtml
+@ref{Data safety,,,lziprecover}.
+@end ifnothtml
 
 @item
 The lzip format is as simple as possible (but not simpler). The lzip
@@ -111,6 +115,11 @@ bzip2, which makes it safer than compressors returning ambiguous warning
 values (like gzip) when it is used as a back end for other programs like
 tar or zutils.
 
+Clzip will automatically use the smallest possible dictionary size for
+each file without exceeding the given limit. Keep in mind that the
+decompression memory requirement is affected at compression time by the
+choice of dictionary size limit.
+
 The amount of memory required for compression is about 1 or 2 times the
 dictionary size limit (1 if input file size is less than dictionary size
 limit, else 2) plus 9 times the dictionary size really used. The option
@@ -118,11 +127,6 @@ limit, else 2) plus 9 times the dictionary size really used. The option
 of memory required for decompression is about 46 kB larger than the
 dictionary size really used.
 
-Clzip will automatically use the smallest possible dictionary size for
-each file without exceeding the given limit. Keep in mind that the
-decompression memory requirement is affected at compression time by the
-choice of dictionary size limit.
-
 When compressing, clzip replaces every file given in the command line
 with a compressed version of itself, with the name "original_name.lz".
 When decompressing, clzip attempts to guess the name for the decompressed
@@ -164,72 +168,6 @@ automatically creating multi-member output. The members so created are
 large, about 2 PiB each.
 
-@node Algorithm
-@chapter Algorithm
-@cindex algorithm
-
-In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a
-concrete algorithm; it is more like "any algorithm using the LZMA coding
-scheme". For example, the option '-0' of lzip uses the scheme in almost
-the simplest way possible; issuing the longest match it can find, or a
-literal byte if it can't find a match. Inversely, a much more elaborated
-way of finding coding sequences of minimum size than the one currently
-used by lzip could be developed, and the resulting sequence could also
-be coded using the LZMA coding scheme.
-
-Clzip currently implements two variants of the LZMA algorithm; fast
-(used by option -0) and normal (used by all other compression levels).
-
-The high compression of LZMA comes from combining two basic, well-proven
-compression ideas: sliding dictionaries (LZ77/78) and markov models (the
-thing used by every compression algorithm that uses a range encoder or
-similar order-0 entropy coder as its last stage) with segregation of
-contexts according to what the bits are used for.
-
-Clzip is a two stage compressor. The first stage is a Lempel-Ziv coder,
-which reduces redundancy by translating chunks of data to their
-corresponding distance-length pairs. The second stage is a range encoder
-that uses a different probability model for each type of data;
-distances, lengths, literal bytes, etc.
-
-Here is how it works, step by step:
-
-1) The member header is written to the output stream.
-
-2) The first byte is coded literally, because there are no previous
-bytes to which the match finder can refer to.
-
-3) The main encoder advances to the next byte in the input data and
-calls the match finder.
-
-4) The match finder fills an array with the minimum distances before the
-current byte where a match of a given length can be found.
-
-5) Go back to step 3 until a sequence (formed of pairs, repeated
-distances and literal bytes) of minimum price has been formed. Where the
-price represents the number of output bits produced.
-
-6) The range encoder encodes the sequence produced by the main encoder
-and sends the produced bytes to the output stream.
-
-7) Go back to step 3 until the input data are finished or until the
-member or volume size limits are reached.
-
-8) The range encoder is flushed.
-
-9) The member trailer is written to the output stream.
-
-10) If there are more data to compress, go back to step 1.
-
-@sp 1
-@noindent
-The ideas embodied in clzip are due to (at least) the following people:
-Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for
-the definition of Markov chains), G.N.N. Martin (for the definition of
-range encoding), Igor Pavlov (for putting all the above together in
-LZMA), and Julian Seward (for bzip2's CLI).
-
 @node Invoking clzip
 @chapter Invoking clzip
 @cindex invoking
@@ -276,7 +214,7 @@ Force overwrite of output files.
 @item -F
 @itemx --recompress
-Force recompression of files whose name already has the @samp{.lz} or
+Force re-compression of files whose name already has the @samp{.lz} or
 @samp{.tlz} suffix.
 
 @item -k
@@ -476,6 +414,72 @@ facilitates safe recovery of undamaged members from multi-member files.
 @end table
 
+@node Algorithm
+@chapter Algorithm
+@cindex algorithm
+
+In spite of its name (Lempel-Ziv-Markov chain-Algorithm), LZMA is not a
+concrete algorithm; it is more like "any algorithm using the LZMA coding
+scheme". For example, the option @samp{-0} of lzip uses the scheme in almost
+the simplest way possible; issuing the longest match it can find, or a
+literal byte if it can't find a match. Inversely, a much more elaborated
+way of finding coding sequences of minimum size than the one currently
+used by lzip could be developed, and the resulting sequence could also
+be coded using the LZMA coding scheme.
+
+Clzip currently implements two variants of the LZMA algorithm; fast
+(used by option @samp{-0}) and normal (used by all other compression levels).
+
+The high compression of LZMA comes from combining two basic, well-proven
+compression ideas: sliding dictionaries (LZ77/78) and markov models (the
+thing used by every compression algorithm that uses a range encoder or
+similar order-0 entropy coder as its last stage) with segregation of
+contexts according to what the bits are used for.
+
+Clzip is a two stage compressor. The first stage is a Lempel-Ziv coder,
+which reduces redundancy by translating chunks of data to their
+corresponding distance-length pairs. The second stage is a range encoder
+that uses a different probability model for each type of data;
+distances, lengths, literal bytes, etc.
+
+Here is how it works, step by step:
+
+1) The member header is written to the output stream.
+
+2) The first byte is coded literally, because there are no previous
+bytes to which the match finder can refer to.
+
+3) The main encoder advances to the next byte in the input data and
+calls the match finder.
+
+4) The match finder fills an array with the minimum distances before the
+current byte where a match of a given length can be found.
+
+5) Go back to step 3 until a sequence (formed of pairs, repeated
+distances and literal bytes) of minimum price has been formed. Where the
+price represents the number of output bits produced.
+
+6) The range encoder encodes the sequence produced by the main encoder
+and sends the produced bytes to the output stream.
+
+7) Go back to step 3 until the input data are finished or until the
+member or volume size limits are reached.
+
+8) The range encoder is flushed.
+
+9) The member trailer is written to the output stream.
+
+10) If there are more data to compress, go back to step 1.
+
+@sp 1
+@noindent
+The ideas embodied in clzip are due to (at least) the following people:
+Abraham Lempel and Jacob Ziv (for the LZ algorithm), Andrey Markov (for
+the definition of Markov chains), G.N.N. Martin (for the definition of
+range encoding), Igor Pavlov (for putting all the above together in
+LZMA), and Julian Seward (for bzip2's CLI).
+
 @node Examples
 @chapter A small tutorial with examples
 @cindex examples

encoder.c

@@ -75,7 +75,8 @@ bool Mb_init( struct Matchfinder_base * const mb,
 const int num_prev_positions23,
 const int pos_array_factor, const int ifd )
 {
-const int buffer_size_limit = ( dict_factor * dict_size ) + before + after_size;
+const int buffer_size_limit =
+( dict_factor * dict_size ) + before + after_size;
 unsigned size;
 int i;

main.c

@@ -105,7 +105,7 @@ static void show_help( void )
 " -c, --stdout send output to standard output\n"
 " -d, --decompress decompress\n"
 " -f, --force overwrite existing output files\n"
-" -F, --recompress force recompression of compressed files\n"
+" -F, --recompress force re-compression of compressed files\n"
 " -k, --keep keep (don't delete) input files\n"
 " -m, --match-length=<bytes> set match length limit in bytes [36]\n"
 " -o, --output=<file> if reading stdin, place the output into <file>\n"
@@ -180,7 +180,7 @@ static unsigned long long getnum( const char * const ptr,
 if( !errno && tail[0] )
 {
-int factor = ( tail[1] == 'i' ) ? 1024 : 1000;
+const int factor = ( tail[1] == 'i' ) ? 1024 : 1000;
 int exponent = 0, i;
 bool bad_multiplier = false;
 switch( tail[0] )
@@ -264,7 +264,7 @@ static int open_instream( const char * const name, struct stat * const in_statsp
 if( infd < 0 )
 {
 if( verbosity >= 0 )
-fprintf( stderr, "%s: Can't open input file '%s': %s.\n",
+fprintf( stderr, "%s: Can't open input file '%s': %s\n",
 program_name, name, strerror( errno ) );
 }
 else
@@ -281,7 +281,7 @@ static int open_instream( const char * const name, struct stat * const in_statsp
 fprintf( stderr, "%s: Input file '%s' is not a regular file%s.\n",
 program_name, name,
 ( can_read && !no_ofile ) ?
-" and '--stdout' was not specified" : "" );
+",\n and '--stdout' was not specified" : "" );
 close( infd );
 infd = -1;
 }
@@ -335,7 +335,7 @@ static void set_d_outname( const char * const name, const int i )
 strcpy( output_filename, name );
 strcat( output_filename, ".out" );
 if( verbosity >= 1 )
-fprintf( stderr, "%s: Can't guess original name for '%s' -- using '%s'.\n",
+fprintf( stderr, "%s: Can't guess original name for '%s' -- using '%s'\n",
 program_name, name, output_filename );
 }
@@ -352,7 +352,7 @@ static bool open_outstream( const bool force )
 fprintf( stderr, "%s: Output file '%s' already exists, skipping.\n",
 program_name, output_filename );
 else
-fprintf( stderr, "%s: Can't create output file '%s': %s.\n",
+fprintf( stderr, "%s: Can't create output file '%s': %s\n",
 program_name, output_filename, strerror( errno ) );
 }
 return ( outfd >= 0 );
@@ -519,7 +519,7 @@ static int compress( const unsigned long long member_size,
 if( retval == 0 && verbosity >= 1 )
 {
 if( in_size == 0 || out_size == 0 )
-fprintf( stderr, " no data compressed.\n" );
+fputs( " no data compressed.\n", stderr );
 else
 fprintf( stderr, "%6.3f:1, %6.3f bits/byte, "
 "%5.2f%% saved, %llu in, %llu out.\n",
@@ -598,20 +598,17 @@ static int decompress( const int infd, struct Pretty_print * const pp,
 if( verbosity >= 0 && result <= 2 )
 {
 Pp_show_msg( pp, 0 );
-if( result == 2 )
-fprintf( stderr, "File ends unexpectedly at pos %llu.\n",
-partial_file_pos );
-else
-fprintf( stderr, "Decoder error at pos %llu.\n", partial_file_pos );
+fprintf( stderr, "%s at pos %llu\n", ( result == 2 ) ?
+"File ends unexpectedly" : "Decoder error", partial_file_pos );
 }
 retval = 2; break;
 }
 if( verbosity >= 2 )
-{ fprintf( stderr, testing ? "ok\n" : "done\n" ); Pp_reset( pp ); }
+{ fputs( testing ? "ok\n" : "done\n", stderr ); Pp_reset( pp ); }
 }
 Rd_free( &rdec );
 if( verbosity == 1 && retval == 0 )
-fprintf( stderr, testing ? "ok\n" : "done\n" );
+fputs( testing ? "ok\n" : "done\n", stderr );
 return retval;
 }
@@ -639,8 +636,8 @@ void show_error( const char * const msg, const int errcode, const bool help )
 if( msg && msg[0] )
 {
 fprintf( stderr, "%s: %s", program_name, msg );
-if( errcode > 0 ) fprintf( stderr, ": %s.", strerror( errcode ) );
-fprintf( stderr, "\n" );
+if( errcode > 0 ) fprintf( stderr, ": %s", strerror( errcode ) );
+fputc( '\n', stderr );
 }
 if( help )
 fprintf( stderr, "Try '%s --help' for more information.\n",

testsuite/check.sh

@@ -79,7 +79,6 @@ printf .
 cat in in > in2 || framework_failure
 "${LZIP}" -o copy2 < in2 || fail=1
 "${LZIP}" -t copy2.lz || fail=1
-printf .
 "${LZIP}" -cd copy2.lz > copy2 || fail=1
 cmp in2 copy2 || fail=1
 printf .