Recently I needed to compress certain types of files on a system with very limited memory. After looking at LZ77, LZ78, and derivatives I chose simple huffman coding as it gave me ~50% savings on the file, with limited resources.
The first possible candidate was http://entropyware.info/shcodec/index.html
In the end, I modified gzip to output only huffman compressed data. I modified puff.c (included with zlib) to only care about these types of blocks.
Patch to zlib
--- ../gzip-1.4-orig/deflate.c 2010-01-03 12:26:02.000000000 -0500 +++ deflate.c 2012-07-25 11:47:30.000000000 -0400 @@ -673,8 +673,11 @@ off_t deflate() int flush; /* set if current block must be flushed */ int match_available = 0; /* set if previous match exists */ register unsigned match_length = MIN_MATCH-1; /* length of best match */ + extern int huffonly; /* gzip.c */ + if (huffonly == 0) { if (compr_level <= 3) return deflate_fast(); /* optimized for speed */ + } /* Process the input block. */ while (lookahead != 0) { @@ -690,7 +693,8 @@ off_t deflate() if (hash_head != NIL && prev_length < max_lazy_match && strstart - hash_head <= MAX_DIST && - strstart <= window_size - MIN_LOOKAHEAD) { + strstart <= window_size - MIN_LOOKAHEAD && + huffonly==0) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). @@ -710,7 +714,8 @@ off_t deflate() /* If there was a match at the previous step and the current * match is not better, output the previous match: */ - if (prev_length >= MIN_MATCH && match_length <= prev_length) { + if (prev_length >= MIN_MATCH && match_length <= prev_length + && huffonly==0) { check_match(strstart-1, prev_match, prev_length); --- ../gzip-1.4-orig/gzip.c 2010-01-03 12:26:02.000000000 -0500 +++ gzip.c 2012-07-25 11:53:29.000000000 -0400 @@ -194,6 +194,7 @@ char *env; /* contents of GZI char **args = NULL; /* argv pointer if GZIP env variable defined */ char const *z_suffix; /* default suffix (can be set with --suffix) */ size_t z_len; /* strlen(z_suffix) */ +int huffonly = 0; /* only use huffman codes, no LZ77 */ /* The set of signals that are caught. */ static sigset_t caught_signals; @@ -271,6 +272,7 @@ struct option longopts[] = {"best", 0, 0, '9'}, /* compress better */ {"lzw", 0, 0, 'Z'}, /* make output compatible with old compress */ {"bits", 1, 0, 'b'}, /* max number of bits per code (implies -Z) */ + {"huffonly", 0, 0, 'O'}, /* ascii text mode */ { 0, 0, 0, 0 } }; @@ -352,6 +354,7 @@ local void help() " -Z, --lzw produce output compatible with old compress", " -b, --bits=BITS max number of bits per code (implies -Z)", #endif + " -O, --huffonly only compress using dynamic huffman codes (block type 2)", "", "With no FILE, or when FILE is -, read standard input.", "", @@ -502,6 +505,9 @@ int main (int argc, char **argv) try_help (); break; #endif + case 'O': + huffonly = 1; + break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': level = optc - '0';
Changes to puff.c
--- puff.orig.c 2010-04-25 05:04:16.000000000 -0400 +++ puff.new.c 2012-07-25 13:33:16.000000000 -0400 @@ -160,6 +160,7 @@ local int bits(struct state *s, int need * - A stored block can have zero length. This is sometimes used to byte-align * subsets of the compressed data for random access or partial recovery. */ +#ifndef CONFIG_ONLY_BLOCK_2 local int stored(struct state *s) { unsigned len; /* length of stored block */ @@ -194,6 +195,7 @@ local int stored(struct state *s) /* done with a valid stored block */ return 0; } +#endif /* * Huffman code decoding tables. count[1..MAXBITS] is the number of symbols of @@ -437,6 +439,7 @@ local int codes(struct state *s, const struct huffman *distcode) { int symbol; /* decoded symbol */ +#ifndef CONFIG_ONLY_BLOCK_2 int len; /* length for copy */ unsigned dist; /* distance for copy */ static const short lens[29] = { /* Size base for length codes 257..285 */ @@ -453,6 +456,7 @@ local int codes(struct state *s, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; +#endif /* decode literals and length/distance pairs */ do { @@ -469,6 +473,10 @@ local int codes(struct state *s, s->outcnt++; } else if (symbol > 256) { /* length */ +#ifdef CONFIG_ONLY_BLOCK_2 + /* I never put out anything but literals */ + return -11; +#else /* CONFIG_ONLY_BLOCK_2 */ /* get and compute length */ symbol -= 257; if (symbol >= 29) @@ -501,6 +509,7 @@ local int codes(struct state *s, } else s->outcnt += len; +#endif /* CONFIG_ONLY_BLOCK_2 */ } } while (symbol != 256); /* end of block symbol */ @@ -532,6 +541,7 @@ local int codes(struct state *s, * length, this can be implemented as an incomplete code. Then the invalid * codes are detected while decoding. */ +#ifndef CONFIG_ONLY_BLOCK_2 local int fixed(struct state *s) { static int virgin = 1; @@ -573,6 +583,7 @@ local int fixed(struct state *s) /* decode data until end-of-block code */ return codes(s, &lencode, &distcode); } +#endif /* * Process a dynamic codes block. @@ -668,7 +679,9 @@ local int dynamic(struct state *s) int err; /* construct() return value */ short lengths[MAXCODES]; /* descriptor code lengths */ short lencnt[MAXBITS+1], lensym[MAXLCODES]; /* lencode memory */ +#ifndef CONFIG_ONLY_BLOCK_2 short distcnt[MAXBITS+1], distsym[MAXDCODES]; /* distcode memory */ +#endif struct huffman lencode, distcode; /* length and distance codes */ static const short order[19] = /* permutation of code length codes */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; @@ -676,8 +689,10 @@ local int dynamic(struct state *s) /* construct lencode and distcode */ lencode.count = lencnt; lencode.symbol = lensym; +#ifndef CONFIG_ONLY_BLOCK_2 distcode.count = distcnt; distcode.symbol = distsym; +#endif /* get number of lengths in each table, check lengths */ nlen = bits(s, 5) + 257; @@ -735,9 +750,11 @@ local int dynamic(struct state *s) return -7; /* incomplete code ok only for single length 1 code */ /* build huffman table for distance codes */ +#ifndef CONFIG_ONLY_BLOCK_2 err = construct(&distcode, lengths + nlen, ndist); if (err && (err < 0 || ndist != distcode.count[0] + distcode.count[1])) return -8; /* incomplete code ok only for single length 1 code */ +#endif /* decode data until end-of-block code */ return codes(s, &lencode, &distcode); @@ -816,6 +833,7 @@ int puff(unsigned char *dest, do { last = bits(&s, 1); /* one if last block */ type = bits(&s, 2); /* block type 0..3 */ +#ifndef CONFIG_ONLY_BLOCK_2 err = type == 0 ? stored(&s) : (type == 1 ? @@ -823,6 +841,10 @@ int puff(unsigned char *dest, (type == 2 ? dynamic(&s) : -1)); /* type == 3, invalid */ +#else + /* only support block type 2 */ + err = type == 2 ? dynamic(&s) : -1; +#endif if (err != 0) break; /* return with error */ } while (!last);
0 comments:
Post a Comment