lzd/lzd.cc

/*  Lzd - Educational decompressor for the lzip format
    Copyright (C) 2013-2015 Antonio Diaz Diaz.

    This program is free software: you have unlimited permission
    to copy, distribute and modify it.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/
/*
    Exit status: 0 for a normal exit, 1 for environmental problems
    (file not found, invalid flags, I/O errors, etc), 2 to indicate a
    corrupt or invalid input file.
*/

#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdint.h>
#include <unistd.h>
#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER)
#include <fcntl.h>
#include <io.h>
#endif


class State
  {
  int st;

public:
  enum { states = 12 };
  State() : st( 0 ) {}
  int operator()() const { return st; }
  bool is_char() const { return st < 7; }

  void set_char()
    {
    static const int next[states] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5 };
    st = next[st];
    }
  void set_match()     { st = ( st < 7 ) ? 7 : 10; }
  void set_rep()       { st = ( st < 7 ) ? 8 : 11; }
  void set_short_rep() { st = ( st < 7 ) ? 9 : 11; }
  };


enum {
  min_dictionary_size = 1 << 12,
  max_dictionary_size = 1 << 29,
  literal_context_bits = 3,
  pos_state_bits = 2,
  pos_states = 1 << pos_state_bits,
  pos_state_mask = pos_states - 1,

  len_states = 4,
  dis_slot_bits = 6,
  start_dis_model = 4,
  end_dis_model = 14,
  modeled_distances = 1 << (end_dis_model / 2),		// 128
  dis_align_bits = 4,
  dis_align_size = 1 << dis_align_bits,

  len_low_bits = 3,
  len_mid_bits = 3,
  len_high_bits = 8,
  len_low_symbols = 1 << len_low_bits,
  len_mid_symbols = 1 << len_mid_bits,
  len_high_symbols = 1 << len_high_bits,
  max_len_symbols = len_low_symbols + len_mid_symbols + len_high_symbols,

  min_match_len = 2,					// must be 2

  bit_model_move_bits = 5,
  bit_model_total_bits = 11,
  bit_model_total = 1 << bit_model_total_bits };

struct Bit_model
  {
  int probability;
  Bit_model() : probability( bit_model_total / 2 ) {}
  };

struct Len_model
  {
  Bit_model choice1;
  Bit_model choice2;
  Bit_model bm_low[pos_states][len_low_symbols];
  Bit_model bm_mid[pos_states][len_mid_symbols];
  Bit_model bm_high[len_high_symbols];
  };


class CRC32
  {
  uint32_t data[256];		// Table of CRCs of all 8-bit messages.

public:
  CRC32()
    {
    for( unsigned n = 0; n < 256; ++n )
      {
      unsigned c = n;
      for( int k = 0; k < 8; ++k )
        { if( c & 1 ) c = 0xEDB88320U ^ ( c >> 1 ); else c >>= 1; }
      data[n] = c;
      }
    }

  void update_buf( uint32_t & crc, const uint8_t * const buffer,
                   const int size ) const
    {
    for( int i = 0; i < size; ++i )
      crc = data[(crc^buffer[i])&0xFF] ^ ( crc >> 8 );
    }
  };

const CRC32 crc32;


typedef uint8_t File_header[6];	// 0-3 magic, 4 version, 5 coded_dict_size

typedef uint8_t File_trailer[20];
			//  0-3  CRC32 of the uncompressed data
			//  4-11 size of the uncompressed data
			// 12-19 member size including header and trailer

class Range_decoder
  {
  uint32_t code;
  uint32_t range;

public:
  Range_decoder() : code( 0 ), range( 0xFFFFFFFFU )
    {
    for( int i = 0; i < 5; ++i ) code = (code << 8) | get_byte();
    }

  uint8_t get_byte() { return std::getc( stdin ); }

  int decode( const int num_bits )
    {
    int symbol = 0;
    for( int i = 0; i < num_bits; ++i )
      {
      range >>= 1;
      symbol <<= 1;
      if( code >= range ) { code -= range; symbol |= 1; }
      if( range <= 0x00FFFFFFU )			// normalize
        { range <<= 8; code = (code << 8) | get_byte(); }
      }
    return symbol;
    }

  int decode_bit( Bit_model & bm )
    {
    int symbol;
    const uint32_t bound = ( range >> bit_model_total_bits ) * bm.probability;
    if( code < bound )
      {
      range = bound;
      bm.probability += (bit_model_total - bm.probability) >> bit_model_move_bits;
      symbol = 0;
      }
    else
      {
      range -= bound;
      code -= bound;
      bm.probability -= bm.probability >> bit_model_move_bits;
      symbol = 1;
      }
    if( range <= 0x00FFFFFFU )				// normalize
      { range <<= 8; code = (code << 8) | get_byte(); }
    return symbol;
    }

  int decode_tree( Bit_model bm[], const int num_bits )
    {
    int symbol = 1;
    for( int i = 0; i < num_bits; ++i )
      symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
    return symbol - (1 << num_bits);
    }

  int decode_tree_reversed( Bit_model bm[], const int num_bits )
    {
    int symbol = decode_tree( bm, num_bits );
    int reversed_symbol = 0;
    for( int i = 0; i < num_bits; ++i )
      {
      reversed_symbol = ( reversed_symbol << 1 ) | ( symbol & 1 );
      symbol >>= 1;
      }
    return reversed_symbol;
    }

  int decode_matched( Bit_model bm[], const int match_byte )
    {
    Bit_model * const bm1 = bm + 0x100;
    int symbol = 1;
    for( int i = 7; i >= 0; --i )
      {
      const int match_bit = ( match_byte >> i ) & 1;
      const int bit = decode_bit( bm1[(match_bit<<8)+symbol] );
      symbol = ( symbol << 1 ) | bit;
      if( match_bit != bit )
        {
        while( symbol < 0x100 )
          symbol = ( symbol << 1 ) | decode_bit( bm[symbol] );
        break;
        }
      }
    return symbol & 0xFF;
    }

  int decode_len( Len_model & lm, const int pos_state )
    {
    if( decode_bit( lm.choice1 ) == 0 )
      return decode_tree( lm.bm_low[pos_state], len_low_bits );
    if( decode_bit( lm.choice2 ) == 0 )
      return len_low_symbols +
             decode_tree( lm.bm_mid[pos_state], len_mid_bits );
    return len_low_symbols + len_mid_symbols +
           decode_tree( lm.bm_high, len_high_bits );
    }
  };


class LZ_decoder
  {
  unsigned long long partial_data_pos;
  Range_decoder rdec;
  const unsigned dictionary_size;
  uint8_t * const buffer;	// output buffer
  unsigned pos;			// current pos in buffer
  unsigned stream_pos;		// first byte not yet written to stdout
  uint32_t crc_;

  void flush_data();

  uint8_t peek( const unsigned distance ) const
    {
    unsigned i = pos - distance - 1;
    if( pos <= distance ) i += dictionary_size;
    return buffer[i];
    }

  void put_byte( const uint8_t b )
    {
    buffer[pos] = b;
    if( ++pos >= dictionary_size ) flush_data();
    }

public:
  explicit LZ_decoder( const unsigned dict_size )
    :
    partial_data_pos( 0 ),
    dictionary_size( dict_size ),
    buffer( new uint8_t[dictionary_size] ),
    pos( 0 ),
    stream_pos( 0 ),
    crc_( 0xFFFFFFFFU )
    { buffer[dictionary_size-1] = 0; }		// prev_byte of first byte

  ~LZ_decoder() { delete[] buffer; }

  unsigned crc() const { return crc_ ^ 0xFFFFFFFFU; }
  unsigned long long data_position() const { return partial_data_pos + pos; }

  bool decode_member();
  };


void LZ_decoder::flush_data()
  {
  if( pos > stream_pos )
    {
    const unsigned size = pos - stream_pos;
    crc32.update_buf( crc_, buffer + stream_pos, size );
    errno = 0;
    if( std::fwrite( buffer + stream_pos, 1, size, stdout ) != size )
      { std::fprintf( stderr, "Write error: %s\n", std::strerror( errno ) );
        std::exit( 1 ); }
    if( pos >= dictionary_size ) { partial_data_pos += pos; pos = 0; }
    stream_pos = pos;
    }
  }


bool LZ_decoder::decode_member()		// Returns false if error
  {
  Bit_model bm_literal[1<<literal_context_bits][0x300];
  Bit_model bm_match[State::states][pos_states];
  Bit_model bm_rep[State::states];
  Bit_model bm_rep0[State::states];
  Bit_model bm_rep1[State::states];
  Bit_model bm_rep2[State::states];
  Bit_model bm_len[State::states][pos_states];
  Bit_model bm_dis_slot[len_states][1<<dis_slot_bits];
  Bit_model bm_dis[modeled_distances-end_dis_model];
  Bit_model bm_align[dis_align_size];
  Len_model match_len_model;
  Len_model rep_len_model;
  unsigned rep0 = 0;			// rep[0-3] latest four distances
  unsigned rep1 = 0;			// used for efficient coding of
  unsigned rep2 = 0;			// repeated distances
  unsigned rep3 = 0;
  State state;

  while( !std::feof( stdin ) && !std::ferror( stdin ) )
    {
    const int pos_state = data_position() & pos_state_mask;
    if( rdec.decode_bit( bm_match[state()][pos_state] ) == 0 )	// 1st bit
      {
      const uint8_t prev_byte = peek( 0 );
      const int literal_state = prev_byte >> ( 8 - literal_context_bits );
      Bit_model * const bm = bm_literal[literal_state];
      if( state.is_char() )
        put_byte( rdec.decode_tree( bm, 8 ) );
      else
        put_byte( rdec.decode_matched( bm, peek( rep0 ) ) );
      state.set_char();
      }
    else					// match or repeated match
      {
      int len;
      if( rdec.decode_bit( bm_rep[state()] ) != 0 )		// 2nd bit
        {
        if( rdec.decode_bit( bm_rep0[state()] ) != 0 )		// 3rd bit
          {
          unsigned distance;
          if( rdec.decode_bit( bm_rep1[state()] ) == 0 )	// 4th bit
            distance = rep1;
          else
            {
            if( rdec.decode_bit( bm_rep2[state()] ) == 0 )	// 5th bit
              distance = rep2;
            else
              { distance = rep3; rep3 = rep2; }
            rep2 = rep1;
            }
          rep1 = rep0;
          rep0 = distance;
          }
        else
          {
          if( rdec.decode_bit( bm_len[state()][pos_state] ) == 0 ) // 4th bit
            { state.set_short_rep(); put_byte( peek( rep0 ) ); continue; }
          }
        state.set_rep();
        len = min_match_len + rdec.decode_len( rep_len_model, pos_state );
        }
      else					// match
        {
        rep3 = rep2; rep2 = rep1; rep1 = rep0;
        len = min_match_len + rdec.decode_len( match_len_model, pos_state );
        const int len_state = std::min( len - min_match_len, len_states - 1 );
        const int dis_slot =
          rdec.decode_tree( bm_dis_slot[len_state], dis_slot_bits );
        if( dis_slot < start_dis_model ) rep0 = dis_slot;
        else
          {
          const int direct_bits = ( dis_slot >> 1 ) - 1;
          rep0 = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
          if( dis_slot < end_dis_model )
            rep0 += rdec.decode_tree_reversed( bm_dis + rep0 - dis_slot - 1,
                                               direct_bits );
          else
            {
            rep0 += rdec.decode( direct_bits - dis_align_bits ) << dis_align_bits;
            rep0 += rdec.decode_tree_reversed( bm_align, dis_align_bits );
            if( rep0 == 0xFFFFFFFFU )		// marker found
              {
              flush_data();
              return ( len == min_match_len );	// End Of Stream marker
              }
            }
          }
        state.set_match();
        if( rep0 >= dictionary_size || rep0 >= data_position() )
          { flush_data(); return false; }
        }
      for( int i = 0; i < len; ++i ) put_byte( peek( rep0 ) );
      }
    }
  flush_data();
  return false;
  }


int main( const int argc, const char * const argv[] )
  {
  if( argc > 1 )
    {
    std::printf( "Lzd %s - Educational decompressor for the lzip format.\n",
                 PROGVERSION );
    std::printf( "Study the source to learn how a lzip decompressor works.\n"
                 "See the lzip manual for an explanation of the code.\n"
                 "It is not safe to use lzd for any real work.\n"
                 "\nUsage: %s < file.lz > file\n", argv[0] );
    std::printf( "Lzd decompresses from standard input to standard output.\n"
                 "\nCopyright (C) 2015 Antonio Diaz Diaz.\n"
                 "This is free software: you are free to change and redistribute it.\n"
                 "There is NO WARRANTY, to the extent permitted by law.\n"
                 "Report bugs to lzip-bug@nongnu.org\n"
                 "Lzd home page: http://www.nongnu.org/lzip/lzd.html\n" );
    return 0;
    }

#if defined(__MSVCRT__) || defined(__OS2__) || defined(_MSC_VER)
  setmode( fileno( stdin ), O_BINARY );
  setmode( fileno( stdout ), O_BINARY );
#endif

  for( bool first_member = true; ; first_member = false )
    {
    File_header header;
    for( int i = 0; i < 6; ++i ) header[i] = std::getc( stdin );
    if( std::feof( stdin ) || std::memcmp( header, "LZIP\x01", 5 ) != 0 )
      {
      if( first_member )
        { std::fputs( "Bad magic number (file not in lzip format).\n", stderr );
          return 2; }
      break;
      }
    unsigned dict_size = 1 << ( header[5] & 0x1F );
    dict_size -= ( dict_size / 16 ) * ( ( header[5] >> 5 ) & 7 );
    if( dict_size < min_dictionary_size || dict_size > max_dictionary_size )
      { std::fputs( "Invalid dictionary size in member header.\n", stderr );
        return 2; }

    LZ_decoder decoder( dict_size );
    if( !decoder.decode_member() )
      { std::fputs( "Data error\n", stderr ); return 2; }

    File_trailer trailer;
    for( int i = 0; i < 20; ++i ) trailer[i] = std::getc( stdin );
    unsigned crc = 0;
    for( int i = 3; i >= 0; --i ) { crc <<= 8; crc += trailer[i]; }
    unsigned long long data_size = 0;
    for( int i = 11; i >= 4; --i ) { data_size <<= 8; data_size += trailer[i]; }
    if( crc != decoder.crc() || data_size != decoder.data_position() )
      { std::fputs( "CRC error\n", stderr ); return 2; }
    }

  if( std::fclose( stdout ) != 0 )
    { std::fprintf( stderr, "Can't close stdout: %s\n", std::strerror( errno ) );
      return 1; }
  return 0;
  }