Wednesday, July 25, 2012

Modifying gzip to only do huffman coding

Recently I needed to compress certain types of files on a system with very limited memory. After looking at LZ77, LZ78, and derivatives I chose simple huffman coding as it gave me ~50% savings on the file, with limited resources.

The first possible candidate was http://entropyware.info/shcodec/index.html

In the end, I modified gzip to output only huffman compressed data. I modified puff.c (included with zlib) to only care about these types of blocks.

Patch to zlib

--- ../gzip-1.4-orig/deflate.c  2010-01-03 12:26:02.000000000 -0500
+++ deflate.c   2012-07-25 11:47:30.000000000 -0400
@@ -673,8 +673,11 @@ off_t deflate()
     int flush;               /* set if current block must be flushed */
     int match_available = 0; /* set if previous match exists */
     register unsigned match_length = MIN_MATCH-1; /* length of best match */
+    extern int huffonly;       /* gzip.c */

+    if (huffonly == 0) {
     if (compr_level <= 3) return deflate_fast(); /* optimized for speed */
+    }

     /* Process the input block. */
     while (lookahead != 0) {
@@ -690,7 +693,8 @@ off_t deflate()

         if (hash_head != NIL && prev_length < max_lazy_match &&
             strstart - hash_head <= MAX_DIST &&
-            strstart <= window_size - MIN_LOOKAHEAD) {
+            strstart <= window_size - MIN_LOOKAHEAD &&
+           huffonly==0) {
             /* To simplify the code, we prevent matches with the string
              * of window index 0 (in particular we have to avoid a match
              * of the string with itself at the start of the input file).
@@ -710,7 +714,8 @@ off_t deflate()
         /* If there was a match at the previous step and the current
          * match is not better, output the previous match:
          */
-        if (prev_length >= MIN_MATCH && match_length <= prev_length) {
+        if (prev_length >= MIN_MATCH && match_length <= prev_length
+               && huffonly==0) {

             check_match(strstart-1, prev_match, prev_length);

--- ../gzip-1.4-orig/gzip.c     2010-01-03 12:26:02.000000000 -0500
+++ gzip.c      2012-07-25 11:53:29.000000000 -0400
@@ -194,6 +194,7 @@ char *env;            /* contents of GZI
 char **args = NULL;   /* argv pointer if GZIP env variable defined */
 char const *z_suffix; /* default suffix (can be set with --suffix) */
 size_t z_len;         /* strlen(z_suffix) */
+int huffonly = 0;   /* only use huffman codes, no LZ77 */

 /* The set of signals that are caught.  */
 static sigset_t caught_signals;
@@ -271,6 +272,7 @@ struct option longopts[] =
     {"best",       0, 0, '9'}, /* compress better */
     {"lzw",        0, 0, 'Z'}, /* make output compatible with old compress */
     {"bits",       1, 0, 'b'}, /* max number of bits per code (implies -Z) */
+    {"huffonly",   0, 0, 'O'}, /* ascii text mode */
     { 0, 0, 0, 0 }
 };

@@ -352,6 +354,7 @@ local void help()
  "  -Z, --lzw         produce output compatible with old compress",
  "  -b, --bits=BITS   max number of bits per code (implies -Z)",
 #endif
+ "  -O, --huffonly    only compress using dynamic huffman codes (block type 2)",
  "",
  "With no FILE, or when FILE is -, read standard input.",
  "",
@@ -502,6 +505,9 @@ int main (int argc, char **argv)
            try_help ();
            break;
 #endif
+       case 'O':
+           huffonly = 1;
+           break;
        case '1':  case '2':  case '3':  case '4':
        case '5':  case '6':  case '7':  case '8':  case '9':
            level = optc - '0';     

Changes to puff.c

--- puff.orig.c 2010-04-25 05:04:16.000000000 -0400
+++ puff.new.c  2012-07-25 13:33:16.000000000 -0400
@@ -160,6 +160,7 @@ local int bits(struct state *s, int need
  * - A stored block can have zero length.  This is sometimes used to byte-align
  *   subsets of the compressed data for random access or partial recovery.
  */
+#ifndef CONFIG_ONLY_BLOCK_2
 local int stored(struct state *s)
 {
     unsigned len;       /* length of stored block */
@@ -194,6 +195,7 @@ local int stored(struct state *s)
     /* done with a valid stored block */
     return 0;
 }
+#endif

 /*
  * Huffman code decoding tables.  count[1..MAXBITS] is the number of symbols of
@@ -437,6 +439,7 @@ local int codes(struct state *s,
                 const struct huffman *distcode)
 {
     int symbol;         /* decoded symbol */
+#ifndef CONFIG_ONLY_BLOCK_2
     int len;            /* length for copy */
     unsigned dist;      /* distance for copy */
     static const short lens[29] = { /* Size base for length codes 257..285 */
@@ -453,6 +456,7 @@ local int codes(struct state *s,
         0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
         7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
         12, 12, 13, 13};
+#endif

     /* decode literals and length/distance pairs */
     do {
@@ -469,6 +473,10 @@ local int codes(struct state *s,
             s->outcnt++;
         }
         else if (symbol > 256) {        /* length */
+#ifdef CONFIG_ONLY_BLOCK_2
+           /* I never put out anything but literals */
+           return -11;
+#else /* CONFIG_ONLY_BLOCK_2 */
             /* get and compute length */
             symbol -= 257;
             if (symbol >= 29)
@@ -501,6 +509,7 @@ local int codes(struct state *s,
             }
             else
                 s->outcnt += len;
+#endif /* CONFIG_ONLY_BLOCK_2 */
         }
     } while (symbol != 256);            /* end of block symbol */

@@ -532,6 +541,7 @@ local int codes(struct state *s,
  *   length, this can be implemented as an incomplete code.  Then the invalid
  *   codes are detected while decoding.
  */
+#ifndef CONFIG_ONLY_BLOCK_2
 local int fixed(struct state *s)
 {
     static int virgin = 1;
@@ -573,6 +583,7 @@ local int fixed(struct state *s)
     /* decode data until end-of-block code */
     return codes(s, &lencode, &distcode);
 }
+#endif

 /*
  * Process a dynamic codes block.
@@ -668,7 +679,9 @@ local int dynamic(struct state *s)
     int err;                            /* construct() return value */
     short lengths[MAXCODES];            /* descriptor code lengths */
     short lencnt[MAXBITS+1], lensym[MAXLCODES];         /* lencode memory */
+#ifndef CONFIG_ONLY_BLOCK_2
     short distcnt[MAXBITS+1], distsym[MAXDCODES];       /* distcode memory */
+#endif
     struct huffman lencode, distcode;   /* length and distance codes */
     static const short order[19] =      /* permutation of code length codes */
         {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
@@ -676,8 +689,10 @@ local int dynamic(struct state *s)
     /* construct lencode and distcode */
     lencode.count = lencnt;
     lencode.symbol = lensym;
+#ifndef CONFIG_ONLY_BLOCK_2
     distcode.count = distcnt;
     distcode.symbol = distsym;
+#endif

     /* get number of lengths in each table, check lengths */
     nlen = bits(s, 5) + 257;
@@ -735,9 +750,11 @@ local int dynamic(struct state *s)
         return -7;      /* incomplete code ok only for single length 1 code */

     /* build huffman table for distance codes */
+#ifndef CONFIG_ONLY_BLOCK_2
     err = construct(&distcode, lengths + nlen, ndist);
     if (err && (err < 0 || ndist != distcode.count[0] + distcode.count[1]))
         return -8;      /* incomplete code ok only for single length 1 code */
+#endif

     /* decode data until end-of-block code */
     return codes(s, &lencode, &distcode);
@@ -816,6 +833,7 @@ int puff(unsigned char *dest,
         do {
             last = bits(&s, 1);         /* one if last block */
             type = bits(&s, 2);         /* block type 0..3 */
+#ifndef CONFIG_ONLY_BLOCK_2
             err = type == 0 ?
                     stored(&s) :
                     (type == 1 ?
@@ -823,6 +841,10 @@ int puff(unsigned char *dest,
                         (type == 2 ?
                             dynamic(&s) :
                             -1));       /* type == 3, invalid */
+#else
+           /* only support block type 2 */
+            err = type == 2 ? dynamic(&s) : -1;
+#endif
             if (err != 0)
                 break;                  /* return with error */
         } while (!last);

0 comments:

Post a Comment