The Data Compression Book-:LZ78 Compression

The Code

The source code for a complete twelve-bit version of LZW compression and decompression follows.
/************************* Start of LZW12.C **************************
*
* This is 12 bit LZW program, which is discussed in the first part
* of the chapter. It uses a fixed size code, and does not attempt
* to flush the dictionary after it fills up.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "errand.h"
#include "bitio.h"

/*
* Constants used throughout the program. BITS defines how many bits
* will be in a code. TABLE_SIZE defines the size of the dictionary
* table.
*/
#define BITS                    12
#define MAX_CODE                ( ( 1 << BITS ) - 1 )
#define TABLE_SIZE              5021
#define END_OF_STREAM           256
#define FIRST_CODE              257
#define UNUSED                  -1
/*
* Local prototypes.
*/
#ifdef __STDC__
unsigned int find_child_node( int parent_code, int child_character );
unsigned int decode_string( unsigned int offset, unsigned int code );
#else
unsigned int find_child_node ();
unsigned int decode_string ();
#endif

char *CompressionName = "LZW 12 Bit Encoder";
char *Usage  = "in-file out-file\n\n";

/*
* This data structure defines the dictionary. Each entry in the
* dictionary has a code value. This is the code emitted by the
* compressor. Each code is actually made up of two piece: a
* parent_code, and a character. Code values of less than 256 are
* actually plain text codes.
*/

struct dictionary {
 int code_value;
 int parent_code;
 char character;
} dict[TABLE_SIZE ];

char decode_stack[ TABLE_SIZE };

/*
* The compressor is short and simple. It reads in new symbols one
* at a time from the input file. It then checks to see if the
* combination of the current symbol and the current code are already
* defined in the dictionary. If they are not, they are added to the
* dictionary, and we start over with a new one symbol code. If they
* are, the code for the combination of the code and character becomes
* our new code.
*/

void CompressFile( input, output, argc, argv )
FILE *input;
BIT_FILE *output;
int argc;
char *argv[];
{
  int next_code;
  int character;
  int string_code;
  unsigned int index;
  unsigned int i;

  next_code = FIRST_CODE;
  for ( i = 0 ; i < TABLE_SIZE ; i++ )
   dict[ i ].code_value = UNUSED;
  if ( ( string_code = getc( input ) )== EOF )
   string_code = END_OF_STREAM;
  while ( ( character = getc( input ) ) != EOF ) {
   index = find_child_node( string_code, character );
   if ( dict[ index ].code_value != - 1)
    string_code = dict[ index ].code_value;
   else {
    if ( next_code <= MAX_CODE ) {
    dict[ index ].code_value = next_code++;
    dict[ index ].parent_code = string_code;
    dict[ index ].character = (char) character;
   }
   OutputBits( output, (unsigned long) string_code, BITS );
   string_code = character;
  }
 }
 OutputBits( output, (unsigned long) string_code, BITS );
 OutputBits( output, (unsigned long) END_OF_STREAM, BITS );
 while ( argc-- > 0 )
  printf( "Unknown argument: %s\n", *argv++ ); }

/*
* The file expander operates much like the encoder. It has to
* read in codes, then convert the codes to a string of characters.
* The only catch in the whole operation occurs when the encoder
* encounters a CHAR+STRING+CHAR+STRING+CHAR sequence. When this
* occurs, the encoder outputs a code that is not presently defined
* in the table. This is handled as an exception.
*/
void ExpandFile( input, output, argc, argv )
BIT_FILE *input;
FILE *output;
int argc;
char *argv[];
{
  unsigned int next_code;
  unsigned int new_code;
  unsigned int old_code;
  int character;
  unsigned int count;

  next_code = FIRST_CODE;
  old_code = (unsigned int) InputBits( input, BITS );
  if ( old_code == END_OF_STREAM )
   return;
  character = old_code;
  putc( old_code, output );
while ( ( new_code = (unsigned int) InputBits( input, BITS ) )
          != END_OF_STREAM ) {
/*
** This code checks for the CHARACTER+STRING+CHARACTER+STRING+CHARACTER
** case which generates an undefined code. It handles it by decoding
** the last code, and adding a single character to the end of the
** decode string.
*/
  if (new_code >= next_code ) {
   decode_stack[ 0 ] = (char) character;
   count = decode_string( 1, old_code );
  }
  else
   count  = decode_string( 0, new_code );
  character = decode_string[ count - 1 ];
  while ( count > 0 )
   putc( decode_stack[ --count ], output );
  if ( next_code <= MAX_CODE ) {
   dict[ next_code ].parent_code = old_code;
   dict[ next_code ].character = (char) character;
   next_code++;
  }
  old_code = new_code;
  }
  while ( argc-- > 0 )
  print( "Unknown argument: %s\n", *argv++ );
}

/*
* This hashing routine is responsible for finding the table location
* for a string/character combination. The table index is created
* by using an exclusive OR combination of the prefix and character.
* This code also has to check for collisions, and handles them by
* jumping around in the table.

*/
unsigned int find_child_node( parent_code, child_character )
int parent_code;
int child_character;
{
 int index;
 int offset;
 
 index = ( child_character << ( BITS - 8 ) ) ^ parent_code;
 if ( index == 0 )
  offset = 1;
 else
  offset = TABLE_SIZE - index;
 for ( ; ; ) {
  if ( dict[ index ].code_value == UNUSED )
   return( index );
  if ( dict[ index ].parent_code == parent_code &&
   dict[ index ].character == (char) child_character )
   return( index );
  index -= offset;
  if ( index < 0 )
   index += TABLE_SIZE;
 }
}
/*
* This routine decodes a string from the dictionary, and stores it
* in the decode_stack data structure. It returns a count to the
* calling program of how many characters were placed in the stack.
*/

unsigned int decode_string( count, code )
unsigned int count;
unsigned int code;
{
 while ( code > 255 ) {
  decode_stack[ count++ ] = dict[ code ].character;
  code = dict[ code ].parent_code;
  }
  decode_stack[ count++ ] = (char) code;
  return( count );
}

/*************************** End of LZW12.C ****************************/
Table of Contents