/* Lzd - Educational decompressor for the lzip format
   Copyright (C) 2013-2024 Antonio Diaz Diaz.

   This program is free software. Redistribution and use in source and
   binary forms, with or without modification, are permitted provided
   that the following conditions are met:

   1. Redistributions of source code must retain the above copyright
   notice, this list of conditions, and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
/*
   Exit status: 0 for a normal exit, 1 for environmental problems
   (file not found, invalid command-line options, I/O errors, etc), 2 to
   indicate a corrupt or invalid input file.
*/

#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <unistd.h>
#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__
#include <fcntl.h>
#include <io.h>
#endif


/* Coding state of the decoder: a number in the range [0, states) that
   depends on the kinds of the most recently decoded sequences (literal,
   match, repeated match, short repeated match). The decoder uses it as an
   index to select bit models. */
class State
  {
  int st;

public:
  enum { states = 12 };
  State() : st( 0 ) {}
  int operator()() const { return st; }		// current state number

  // True if the state is the initial one or was reached through set_char.
  // set_match, set_rep, and set_short_rep always produce states >= 7.
  bool is_char() const { return st < 7; }

  // Transition applied after decoding a literal byte.
  void set_char()
    {
    // The table is constant, so build it once instead of at every call.
    static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
    st = next[st];
    }

  // Transitions applied after a match, a repeated match, and a one-byte
  // repeated match respectively. The result only depends on whether the
  // previous state was a 'char' state.
  void set_match()     { st = ( st < 7 ) ? 7 : 10; }
  void set_rep()       { st = ( st < 7 ) ? 8 : 11; }
  void set_short_rep() { st = ( st < 7 ) ? 9 : 11; }
  };


/* Constants used by the decoder: limits of the dictionary size accepted in
   the member header, sizes of the bit model arrays, and parameters of the
   adaptive probability models. */
enum {
  min_dictionary_size = 1 << 12,			// 4 KiB
  max_dictionary_size = 1 << 29,			// 512 MiB
  literal_context_bits = 3,	// high bits of prev_byte selecting bm_literal
  literal_pos_state_bits = 0,				// not used
  pos_state_bits = 2,
  pos_states = 1 << pos_state_bits,			// 4
  pos_state_mask = pos_states - 1,			// 3

  len_states = 4,		// number of distance slot trees
  dis_slot_bits = 6,		// bits of a distance slot
  start_dis_model = 4,		// slots below this are the distance itself
  end_dis_model = 14,		// slots from this one use direct + align bits
  modeled_distances = 1 << ( end_dis_model / 2 ),	// 128
  dis_align_bits = 4,
  dis_align_size = 1 << dis_align_bits,			// 16

  len_low_bits = 3,
  len_mid_bits = 3,
  len_high_bits = 8,
  len_low_symbols = 1 << len_low_bits,			// 8
  len_mid_symbols = 1 << len_mid_bits,			// 8
  len_high_symbols = 1 << len_high_bits,		// 256
  max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,

  min_match_len = 2,					// must be 2

  bit_model_move_bits = 5,	// shift controlling the adaptation speed
  bit_model_total_bits = 11,
  bit_model_total = 1 << bit_model_total_bits };	// 2048

struct Bit_model
|
|
{
|
|
int probability;
|
|
Bit_model() : probability( bit_model_total / 2 ) {}
|
|
};
|
|
|
|
struct Len_model
|
|
{
|
|
Bit_model choice1;
|
|
Bit_model choice2;
|
|
Bit_model bm_low[pos_states][len_low_symbols];
|
|
Bit_model bm_mid[pos_states][len_mid_symbols];
|
|
Bit_model bm_high[len_high_symbols];
|
|
};
|
|
|
|
|
|
/* Table-driven CRC32. The table is computed at construction with the
   polynomial constant 0xEDB88320, processing bits least significant
   first. */
class CRC32
  {
  uint32_t data[256];		// Table of CRCs of all 8-bit messages.

public:
  CRC32()
    {
    for( unsigned n = 0; n < 256; ++n )
      {
      unsigned c = n;
      for( int k = 0; k < 8; ++k )
        { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
      data[n] = c;
      }
    }

  /* Updates 'crc' in place with the 'size' bytes at 'buffer'.
     The caller provides the initial value and applies the final inversion
     (LZ_decoder starts from 0xFFFFFFFF and xors the result with
     0xFFFFFFFF). A 'size' of 0 leaves 'crc' unchanged. */
  void update_buf( uint32_t & crc, const uint8_t * const buffer,
                   const int size ) const
    {
    for( int i = 0; i < size; ++i )
      crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
    }
  };

const CRC32 crc32;		// single table shared by the whole program


// Sizes in bytes of the header and trailer that delimit each lzip member.
enum { header_size = 6, trailer_size = 20 };

typedef uint8_t Lzip_header[header_size];	// 0-3 magic bytes
						//   4 version
						//   5 coded dictionary size

typedef uint8_t Lzip_trailer[trailer_size];
			//  0-3  CRC32 of the uncompressed data
			//  4-11 size of the uncompressed data
			// 12-19 member size including header and trailer

class Range_decoder
|
|
{
|
|
unsigned long long member_pos;
|
|
uint32_t code;
|
|
uint32_t range;
|
|
|
|
public:
|
|
Range_decoder()
|
|
: member_pos( header_size ), code( 0 ), range( 0xFFFFFFFFU )
|
|
{
|
|
get_byte(); // discard first byte of the LZMA stream
|
|
for( int i = 0; i < 4; ++i ) code = ( code << 8 ) | get_byte();
|
|
}
|
|
|
|
uint8_t get_byte() { ++member_pos; return std::getc( stdin ); }
|
|
unsigned long long member_position() const { return member_pos; }
|
|
|
|
unsigned decode( const int num_bits )
|
|
{
|
|
unsigned symbol = 0;
|
|
for( int i = num_bits; i > 0; --i )
|
|
{
|
|
range >>= 1;
|
|
symbol <<= 1;
|
|
if( code >= range ) { code -= range; symbol |= 1; }
|
|
if( range <= 0x00FFFFFFU ) // normalize
|
|
{ range <<= 8; code = ( code << 8 ) | get_byte(); }
|
|
}
|
|
return symbol;
|
|
}
|
|
|
|
bool decode_bit( Bit_model & bm )
|
|
{
|
|
bool symbol;
|
|
const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
|
|
if( code < bound )
|
|
{
|
|
range = bound;
|
|
bm.probability +=
|
|
( bit_model_total - bm.probability ) >> bit_model_move_bits;
|
|
symbol = 0;
|
|
}
|
|
else
|
|
{
|
|
code -= bound;
|
|
range -= bound;
|
|
bm.probability -= bm.probability >> bit_model_move_bits;
|
|
symbol = 1;
|
|
}
|
|
if( range <= 0x00FFFFFFU ) // normalize
|
|
{ range <<= 8; code = ( code << 8 ) | get_byte(); }
|
|
return symbol;
|
|
}
|
|
|
|
unsigned decode_tree( Bit_model bm[], const int num_bits )
|
|
{
|
|
unsigned symbol = 1;
|
|
for( int i = 0; i < num_bits; ++i )
|
|
symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
|
|
return symbol - ( 1 << num_bits );
|
|
}
|
|
|
|
unsigned decode_tree_reversed( Bit_model bm[], const int num_bits )
|
|
{
|
|
unsigned symbol = decode_tree( bm, num_bits );
|
|
unsigned reversed_symbol = 0;
|
|
for( int i = 0; i < num_bits; ++i )
|
|
{
|
|
reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
|
|
symbol >>= 1;
|
|
}
|
|
return reversed_symbol;
|
|
}
|
|
|
|
unsigned decode_matched( Bit_model bm[], const unsigned match_byte )
|
|
{
|
|
unsigned symbol = 1;
|
|
for( int i = 7; i >= 0; --i )
|
|
{
|
|
const bool match_bit = ( match_byte >> i ) & 1;
|
|
const bool bit = decode_bit( bm[symbol+(match_bit<<8)+0x100] );
|
|
symbol = ( symbol << 1 ) | bit;
|
|
if( match_bit != bit )
|
|
{
|
|
while( symbol < 0x100 )
|
|
symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
|
|
break;
|
|
}
|
|
}
|
|
return symbol & 0xFF;
|
|
}
|
|
|
|
unsigned decode_len( Len_model & lm, const int pos_state )
|
|
{
|
|
if( decode_bit( lm.choice1 ) == 0 )
|
|
return min_match_len +
|
|
decode_tree( lm.bm_low[pos_state], len_low_bits );
|
|
if( decode_bit( lm.choice2 ) == 0 )
|
|
return min_match_len + len_low_symbols +
|
|
decode_tree( lm.bm_mid[pos_state], len_mid_bits );
|
|
return min_match_len + len_low_symbols + len_mid_symbols +
|
|
decode_tree( lm.bm_high, len_high_bits );
|
|
}
|
|
};
|
|
|
|
|
|
class LZ_decoder
|
|
{
|
|
unsigned long long partial_data_pos;
|
|
Range_decoder rdec;
|
|
const unsigned dictionary_size;
|
|
uint8_t * const buffer; // output buffer
|
|
unsigned pos; // current pos in buffer
|
|
unsigned stream_pos; // first byte not yet written to stdout
|
|
uint32_t crc_;
|
|
bool pos_wrapped;
|
|
|
|
void flush_data();
|
|
|
|
uint8_t peek( const unsigned distance ) const
|
|
{
|
|
if( pos > distance ) return buffer[pos - distance - 1];
|
|
if( pos_wrapped ) return buffer[dictionary_size + pos - distance - 1];
|
|
return 0; // prev_byte of first byte
|
|
}
|
|
|
|
void put_byte( const uint8_t b )
|
|
{
|
|
buffer[pos] = b;
|
|
if( ++pos >= dictionary_size ) flush_data();
|
|
}
|
|
|
|
public:
|
|
explicit LZ_decoder( const unsigned dict_size )
|
|
:
|
|
partial_data_pos( 0 ),
|
|
dictionary_size( dict_size ),
|
|
buffer( new uint8_t[dictionary_size] ),
|
|
pos( 0 ),
|
|
stream_pos( 0 ),
|
|
crc_( 0xFFFFFFFFU ),
|
|
pos_wrapped( false )
|
|
{}
|
|
|
|
~LZ_decoder() { delete[] buffer; }
|
|
|
|
unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
|
|
unsigned long long data_position() const
|
|
{ return partial_data_pos + pos; }
|
|
uint8_t get_byte() { return rdec.get_byte(); }
|
|
unsigned long long member_position() const
|
|
{ return rdec.member_position(); }
|
|
|
|
bool decode_member();
|
|
};
|
|
|
|
|
|
void LZ_decoder::flush_data()
|
|
{
|
|
if( pos > stream_pos )
|
|
{
|
|
const unsigned size = pos - stream_pos;
|
|
crc32.update_buf( crc_, buffer + stream_pos, size );
|
|
if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
|
|
{ std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
|
|
std::exit( 1 ); }
|
|
if( pos >= dictionary_size )
|
|
{ partial_data_pos += pos; pos = 0; pos_wrapped = true; }
|
|
stream_pos = pos;
|
|
}
|
|
}
|
|
|
|
|
|
bool LZ_decoder::decode_member() // Return false if error
|
|
{
|
|
Bit_model bm_literal[1<<literal_context_bits][0x300];
|
|
Bit_model bm_match[State::states][pos_states];
|
|
Bit_model bm_rep[State::states];
|
|
Bit_model bm_rep0[State::states];
|
|
Bit_model bm_rep1[State::states];
|
|
Bit_model bm_rep2[State::states];
|
|
Bit_model bm_len[State::states][pos_states];
|
|
Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
|
|
Bit_model bm_dis[modeled_distances-end_dis_model+1];
|
|
Bit_model bm_align[dis_align_size];
|
|
Len_model match_len_model;
|
|
Len_model rep_len_model;
|
|
unsigned rep0 = 0; // rep[0-3] latest four distances
|
|
unsigned rep1 = 0; // used for efficient coding of
|
|
unsigned rep2 = 0; // repeated distances
|
|
unsigned rep3 = 0;
|
|
State state;
|
|
|
|
while( !std::feof( stdin ) && !std::ferror( stdin ) )
|
|
{
|
|
const int pos_state = data_position() & pos_state_mask;
|
|
if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 ) // 1st bit
|
|
{
|
|
// literal byte
|
|
const uint8_t prev_byte = peek( 0 );
|
|
const int literal_state = prev_byte >> ( 8 - literal_context_bits );
|
|
Bit_model * const bm = bm_literal[literal_state];
|
|
if( state.is_char() )
|
|
put_byte( rdec.decode_tree( bm, 8 ) );
|
|
else
|
|
put_byte( rdec.decode_matched( bm, peek( rep0 ) ) );
|
|
state.set_char();
|
|
continue;
|
|
}
|
|
// match or repeated match
|
|
int len;
|
|
if( rdec.decode_bit( bm_rep[state()] ) != 0 ) // 2nd bit
|
|
{
|
|
if( rdec.decode_bit( bm_rep0[state()] ) == 0 ) // 3rd bit
|
|
{
|
|
if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
|
|
{ state.set_short_rep(); put_byte( peek( rep0 ) ); continue; }
|
|
}
|
|
else
|
|
{
|
|
unsigned distance;
|
|
if( rdec.decode_bit( bm_rep1[state()] ) == 0 ) // 4th bit
|
|
distance = rep1;
|
|
else
|
|
{
|
|
if( rdec.decode_bit( bm_rep2[state()] ) == 0 ) // 5th bit
|
|
distance = rep2;
|
|
else
|
|
{ distance = rep3; rep3 = rep2; }
|
|
rep2 = rep1;
|
|
}
|
|
rep1 = rep0;
|
|
rep0 = distance;
|
|
}
|
|
state.set_rep();
|
|
len = rdec.decode_len( rep_len_model, pos_state );
|
|
}
|
|
else // match
|
|
{
|
|
rep3 = rep2; rep2 = rep1; rep1 = rep0;
|
|
len = rdec.decode_len( match_len_model, pos_state );
|
|
const int len_state = std::min( len - min_match_len, len_states - 1 );
|
|
rep0 = rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits );
|
|
if( rep0 >= start_dis_model )
|
|
{
|
|
const unsigned dis_slot = rep0;
|
|
const int direct_bits = ( dis_slot >> 1 ) - 1;
|
|
rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
|
|
if( dis_slot < end_dis_model )
|
|
rep0 += rdec.decode_tree_reversed( bm_dis + ( rep0 - dis_slot ),
|
|
direct_bits );
|
|
else
|
|
{
|
|
rep0 +=
|
|
rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
|
|
rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
|
|
if( rep0 == 0xFFFFFFFFU ) // marker found
|
|
{
|
|
flush_data();
|
|
return len == min_match_len; // End Of Stream marker
|
|
}
|
|
}
|
|
}
|
|
state.set_match();
|
|
if( rep0 >= dictionary_size || ( rep0 >= pos && !pos_wrapped ) )
|
|
{ flush_data(); return false; }
|
|
}
|
|
for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) );
|
|
}
|
|
flush_data();
|
|
return false;
|
|
}
|
|
|
|
|
|
int main( const int argc, const char * const argv[] )
|
|
{
|
|
if( argc > 2 || ( argc == 2 && std::strcmp( argv[1], "-d" ) != 0 ) )
|
|
{
|
|
std::printf(
|
|
"Lzd %s - Educational decompressor for the lzip format.\n"
|
|
"Study the source code to learn how a lzip decompressor works.\n"
|
|
"See the lzip manual for an explanation of the code.\n"
|
|
"\nUsage: %s [-d] < file.lz > file\n"
|
|
"Lzd decompresses from standard input to standard output.\n"
|
|
"\nCopyright (C) 2024 Antonio Diaz Diaz.\n"
|
|
"License 2-clause BSD.\n"
|
|
"This is free software: you are free to change and redistribute "
|
|
"it.\nThere is NO WARRANTY, to the extent permitted by law.\n"
|
|
"Report bugs to lzip-bug@nongnu.org\n"
|
|
"Lzd home page: http://www.nongnu.org/lzip/lzd.html\n",
|
|
PROGVERSION, argv[0] );
|
|
return 0;
|
|
}
|
|
|
|
#if defined __MSVCRT__ || defined __OS2__ || defined __DJGPP__
|
|
setmode( STDIN_FILENO, O_BINARY );
|
|
setmode( STDOUT_FILENO, O_BINARY );
|
|
#endif
|
|
|
|
for( bool first_member = true; ; first_member = false )
|
|
{
|
|
Lzip_header header; // check header
|
|
for( int i = 0; i < header_size; ++i ) header[i] = std::getc( stdin );
|
|
if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 )
|
|
{
|
|
if( first_member )
|
|
{ std::fputs( "Bad magic number (file not in lzip format).\n",
|
|
stderr ); return 2; }
|
|
break; // ignore trailing data
|
|
}
|
|
unsigned dict_size = 1 << ( header[5] & 0x1F );
|
|
dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
|
|
if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
|
|
{ std::fputs( "Invalid dictionary size in member header.\n",
|
|
stderr ); return 2; }
|
|
|
|
LZ_decoder decoder( dict_size ); // decode LZMA stream
|
|
if( !decoder.decode_member() )
|
|
{ std::fputs( "Data error\n", stderr ); return 2; }
|
|
|
|
Lzip_trailer trailer; // check trailer
|
|
for( int i = 0; i < trailer_size; ++i ) trailer[i] = decoder.get_byte();
|
|
int retval = 0;
|
|
unsigned crc = 0;
|
|
for( int i = 3; i >= 0; --i ) crc = ( crc << 8 ) + trailer[i];
|
|
if( crc != decoder.crc() )
|
|
{ std::fputs( "CRC mismatch\n", stderr ); retval = 2; }
|
|
|
|
unsigned long long data_size = 0;
|
|
for( int i = 11; i >= 4; --i )
|
|
data_size = ( data_size << 8 ) + trailer[i];
|
|
if( data_size != decoder.data_position() )
|
|
{ std::fputs( "Data size mismatch\n", stderr ); retval = 2; }
|
|
|
|
unsigned long long member_size = 0;
|
|
for( int i = 19; i >= 12; --i )
|
|
member_size = ( member_size << 8 ) + trailer[i];
|
|
if( member_size != decoder.member_position() )
|
|
{ std::fputs( "Member size mismatch\n", stderr ); retval = 2; }
|
|
if( retval ) return retval;
|
|
}
|
|
|
|
if( std::fclose( stdout ) != 0 )
|
|
{ std::fprintf( stderr, "Error closing stdout: %s\n",
|
|
std::strerror( errno ) ); return 1; }
|
|
return 0;
|
|
}
|