commit 7b98214df9de69821c5e453e7da7ade99e8516e7
parent 1e706566a0e7dacd026a988cc2f659e5a6657919
Author: nsz <nszabolcs@gmail.com>
Date: Sun, 12 Jul 2009 15:52:53 +0200
simplified deflate
Diffstat:
deflate_simple.c | | | 548 | +++++++++++++++++++++++++++++++------------------------------------------------ |
1 file changed, 213 insertions(+), 335 deletions(-)
diff --git a/deflate_simple.c b/deflate_simple.c
@@ -2,18 +2,27 @@
#include <string.h>
#include <stdio.h>
+
+/*
+TODO:
+ hufflen: parent+heap only
+ match: check backwards
+*/
+
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
enum {
MinMatch = 3,
- MaxMatch = 258, /* TODO: unused */
+ MaxMatch = 258,
MaxChainStep = 64,
HashSize = 2039,
HashChars = 3, /* TODO: unused */
WinSize = 1 << 15,
- SymSize = 1 << 16
+ SymSize = 1 << 16,
+
+ LzRunlen = 1<<30
};
enum {
@@ -25,19 +34,6 @@ enum {
Nclen = 19 /* number of code length codes */
};
-enum {
- SymLitlen,
- SymDist,
- SymExtra,
- SymClen
-};
-
-typedef struct {
- uchar type; /* litlen, dist, extra, clen */
- uchar len; /* bit length if extra */
- ushort n; /* symbol or code if extra */
-} Sym;
-
typedef struct {
ushort dist;
ushort len;
@@ -48,18 +44,23 @@ typedef struct {
ushort chain[WinSize]; /* hash chains */
int pos; /* position in hash chain */
- Sym symbuf[SymSize]; /* lz77 literal and (len,dist) sequence */
- int symstart;
- int symlen;
+ uchar *src;
+ int lzbuf[WinSize]; /* literal runlength and (len,dist) sequence */
+ int *lzp;
+ int runlen;
+
+ int lfreq[Nlitlen];
+ int dfreq[Ndist];
+/*
+ int lcount;
+ int dcount;
+*/
uchar *dst; /* compressed output */
uint bits;
int nbits;
- int type;
- int firstblock;
int lastblock;
- int finished;
} State;
static uchar fixed_llen[Nlitlen]; /* fixed lit/len huffman code tree */
@@ -243,10 +244,10 @@ static void hufflens(uchar *lens, int *freqs, int nsym, int limit) {
}
}
-/* adding < 16 bits to dst */
-static void outbits(State *s, uint bits, int nbits) {
+/* adding n (<= 16) bits to dst */
+static void putbits(State *s, uint bits, int n) {
s->bits |= bits << s->nbits;
- s->nbits += nbits;
+ s->nbits += n;
while (s->nbits >= 8) {
*s->dst++ = s->bits & 0xff;
s->bits >>= 8;
@@ -254,324 +255,187 @@ static void outbits(State *s, uint bits, int nbits) {
}
}
-static Sym makesym(int type, int len, int sym) {
- Sym s;
-
- s.type = type;
- s.len = len;
- s.n = sym;
- return s;
-}
-
-static void chooseblock(State *s);
-
-static void putsym(State *s, Sym sym) {
- s->symbuf[(s->symstart + s->symlen++) % SymSize] = sym;
- if (s->symlen == SymSize)
- chooseblock(s);
-}
-
-static Sym getsym(State *s, int i) {
- return s->symbuf[(s->symstart + i) % SymSize];
-}
-
-static int clensymbols(Sym *syms, uchar *llen, int nlit, uchar *dlen, int ndist) {
- int lens[Nlitlen + Ndist];
- int i, j;
- int runlen;
- int nsym = 0;
+static int clencodes(int *codes, int *extra, uchar *llen, int nlit, uchar *dlen, int ndist) {
+ int i, c, r, rb;
+ int n = 0;
+/* int exbits = 0;*/
for (i = 0; i < nlit; i++)
- lens[i] = llen[i];
+ codes[i] = llen[i];
for (i = 0; i < ndist; i++)
- lens[nlit + i] = dlen[i];
- for (i = j = 0; i < nlit + ndist; i = j) {
- for (j++; j < nlit + ndist; j++)
- if (lens[j] != lens[i])
- break;
- runlen = j - i;
- /* encode economically */
- if (lens[i] == 0) {
- if (runlen < 3)
- while (runlen--)
- syms[nsym++] = makesym(SymClen, 0, 0);
- else
- while (runlen > 0) {
- int k = runlen < 138 ? runlen : 138;
-
- if (k > runlen - 3 && k < runlen)
- k = runlen - 3;
- if (k < 11) {
- syms[nsym++] = makesym(SymClen, 0, 17);
- syms[nsym++] = makesym(SymExtra, 3, k - 3);
- } else {
- syms[nsym++] = makesym(SymClen, 0, 18);
- syms[nsym++] = makesym(SymExtra, 7, k - 11);
- }
- runlen -= k;
- }
+ codes[nlit + i] = dlen[i];
+ for (i = 0; i < nlit + ndist;) {
+ /* run length encoding */
+ c = codes[i];
+ for (r = 1; r < nlit + ndist && codes[i + r] == c; r++);
+ i += r;
+ if (c == 0) {
+ while (r >= 11) {
+ rb = r > 138 ? 138 : r;
+ codes[n] = 18;
+ extra[n++] = rb - 11;
+ r -= rb;
+/* exbits += 7;*/
+ }
+ if (r >= 3) {
+ codes[n] = 17;
+ extra[n++] = r - 3;
+ r = 0;
+/* exbits += 3;*/
+ }
+ }
+ while (r--) {
+ codes[n++] = c;
+ while (r >= 3) {
+ rb = r > 6 ? 6 : r;
+ codes[n] = 16;
+ extra[n++] = rb - 3;
+ r -= rb;
+/* exbits += 2;*/
+ }
+ }
+ }
+ return n;
+}
+
+static void putblock(State *s, ushort *lcode, uchar *llen, ushort *dcode, uchar *dlen) {
+ int c, lit;
+ int *p;
+ uchar *plit;
+
+ for (p = s->lzbuf, plit = s->src; p != s->lzp;) {
+ c = *p++;
+ if (c >= LzRunlen) {
+ c %= LzRunlen;
+ while (c--) {
+ putbits(s, lcode[*plit], llen[*plit]);
+ plit++;
+ }
} else {
- syms[nsym++] = makesym(SymClen, 0, lens[i]);
- runlen--;
- if (runlen < 3)
- while (runlen--)
- syms[nsym++] = makesym(SymClen, 0, lens[i]);
- else
- while (runlen > 0) {
- int k = runlen < 6 ? runlen : 6;
-
- if (k > runlen - 3 && k < runlen)
- k = runlen - 3;
- syms[nsym++] = makesym(SymClen, 0, 16);
- syms[nsym++] = makesym(SymExtra, 2, k - 3);
- runlen -= k;
- }
+ lit = (c & 0xff) + Nlit + 1;
+ putbits(s, lcode[lit], llen[lit]); /* TODO: lenbits[] */
+ putbits(s, c >> 16, (c >> 8) & 0xff);
+ plit += lenbase[c & 0xff] + (c >> 16);
+ c = *p++;
+ putbits(s, dcode[c & 0xff], dlen[c & 0xff]);
+ putbits(s, c >> 16, (c >> 8) & 0xff);
+
}
}
- return nsym;
+ /* EOB */
+ putbits(s, lcode[256], llen[256]);
}
-/* build dynamic block or a fixed block of given length */
-static void block(State *s, int dynlen, int fixlen) {
- int lfreq[Nlitlen], dfreq[Ndist], cfreq[Nclen];
+static void emitblock(State *s, int blocklen) {
+ int cfreq[Nclen];
uchar llen[Nlitlen], dlen[Ndist], clen[Nclen];
ushort lcode[Nlitlen], dcode[Ndist], ccode[Nclen];
- Sym syms[Nlitlen + Ndist];
- Sym sym;
- int nsym;
- int i, nlit, ndist, nclen;
- int dynsize, fixsize;
- int blocklen;
-
- memset(lfreq, 0, sizeof(lfreq));
- memset(dfreq, 0, sizeof(dfreq));
- lfreq[256] = 1; /* EOB */
- for (i = 0; i < dynlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen)
- lfreq[sym.n]++;
- else if (sym.type == SymDist)
- dfreq[sym.n]++;
- }
- hufflens(llen, lfreq, Nlitlen, CodeBits-1);
- hufflens(dlen, dfreq, Ndist, CodeBits-1);
+ int codes[Nlitlen + Ndist], extra[Nlitlen + Ndist];
+ int i, c, nc;
+ int nlit, ndist, nclen;
+ int dynsize, fixsize, uncsize;
+ int *p;
+ uchar *plit;
+
+ hufflens(llen, s->lfreq, Nlitlen, CodeBits-1);
+ hufflens(dlen, s->dfreq, Ndist, CodeBits-1);
huffcodes(lcode, llen, Nlitlen);
huffcodes(dcode, dlen, Ndist);
for (nlit = Nlitlen; nlit > Nlit && llen[nlit-1] == 0; nlit--);
for (ndist = Ndist; ndist > 1 && dlen[ndist-1] == 0; ndist--);
- nsym = clensymbols(syms, llen, nlit, dlen, ndist);
+ nc = clencodes(codes, extra, llen, nlit, dlen, ndist);
memset(cfreq, 0, sizeof(cfreq));
- for (i = 0; i < nsym; i++) {
- sym = syms[i];
- if (sym.type == SymClen)
- cfreq[sym.n]++;
- }
+ for (i = 0; i < nc; i++)
+ cfreq[codes[i]]++;
hufflens(clen, cfreq, Nclen, 7);
huffcodes(ccode, clen, Nclen);
for (nclen = Nclen; nclen > 4 && clen[clenorder[nclen-1]] == 0; nclen--);
/* calc size */
+ uncsize = 3 + 4 + 8*blocklen; /* +byte alignment */
+ fixsize = 3;
dynsize = 3 + 5 + 5 + 4;
dynsize += 3 * nclen;
- for (i = 0; i < nsym; i++) {
- sym = syms[i];
- if (sym.type == SymClen)
- dynsize += clen[sym.n];
- if (sym.type == SymExtra)
- dynsize += sym.len;
- }
-fprintf(stderr, "tree len: %d bits: %d\n", nsym, dynsize);
- for (i = 0; i < dynlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen)
- dynsize += llen[sym.n];
- if (sym.type == SymDist)
- dynsize += dlen[sym.n];
- if (sym.type == SymExtra)
- dynsize += sym.len;
+ for (i = 0; i < nc; i++) {
+ c = codes[i];
+ dynsize += clen[c];
+ if (c == 16)
+ dynsize += 2;
+ if (c == 17)
+ dynsize += 3;
+ if (c == 18)
+ dynsize += 7;
}
- dynsize += llen[256]; /* EOB */
-
- fixsize = 3;
- for (i = 0; i < fixlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen)
- fixsize += fixed_llen[sym.n];
- if (sym.type == SymDist)
- fixsize += fixed_dlen[sym.n];
- if (sym.type == SymExtra)
- fixsize += sym.len;
+fprintf(stderr, "tree len: %d bits: %d\n", nc, dynsize);
+ for (p = s->lzbuf, plit = s->src; p != s->lzp;) {
+ c = *p++;
+ if (c >= LzRunlen) {
+ c %= LzRunlen;
+ while (c--) {
+ fixsize += fixed_llen[*plit];
+ dynsize += llen[*plit++];
+ }
+ } else {
+ fixsize += fixed_llen[(c & 0xff) + Nlit + 1];
+ fixsize += (c >> 8) & 0xff;
+ dynsize += llen[(c & 0xff) + Nlit + 1];
+ dynsize += (c >> 8) & 0xff;
+ plit += lenbase[c & 0xff] + (c >> 16);
+ c = *p++;
+ fixsize += fixed_dlen[c & 0xff];
+ fixsize += (c >> 8) & 0xff;
+ dynsize += dlen[c & 0xff];
+ dynsize += (c >> 8) & 0xff;
+ }
}
- fixsize += fixed_llen[256]; /* EOB */
+ fixsize += fixed_llen[256];
+ dynsize += llen[256];
- outbits(s, s->lastblock, 1);
- if ((long long)fixsize * dynlen > (long long)dynsize * fixlen) {
+ putbits(s, s->lastblock, 1);
+ if (dynsize < fixsize && dynsize < uncsize) {
fprintf(stderr, "dyn ");
- blocklen = dynlen;
- outbits(s, 2, 2);
- outbits(s, nlit - 257, 5);
- outbits(s, ndist - 1, 5);
- outbits(s, nclen - 4, 4);
+ putbits(s, 2, 2);
+ putbits(s, nlit - 257, 5);
+ putbits(s, ndist - 1, 5);
+ putbits(s, nclen - 4, 4);
/* clen code lengths */
for (i = 0; i < nclen; i++)
- outbits(s, clen[clenorder[i]], 3);
+ putbits(s, clen[clenorder[i]], 3);
/* code lengths */
- for (i = 0; i < nsym; i++) {
- sym = syms[i];
- if (sym.type == SymClen)
- outbits(s, ccode[sym.n], clen[sym.n]);
- if (sym.type == SymExtra)
- outbits(s, sym.n, sym.len);
+ for (i = 0; i < nc; i++) {
+ c = codes[i];
+ putbits(s, ccode[c], clen[c]);
+ if (c == 16)
+ putbits(s, extra[i], 2);
+ if (c == 17)
+ putbits(s, extra[i], 3);
+ if (c == 18)
+ putbits(s, extra[i], 7);
}
- /* block data */
- for (i = 0; i < dynlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen)
- outbits(s, lcode[sym.n], llen[sym.n]);
- if (sym.type == SymDist)
- outbits(s, dcode[sym.n], dlen[sym.n]);
- if (sym.type == SymExtra)
- outbits(s, sym.n, sym.len);
- }
- /* EOB */
- outbits(s, lcode[256], llen[256]);
- } else {
+ /* block data + eob */
+ putblock(s, lcode, llen, dcode, dlen);
+ } else if (fixsize < uncsize) {
fprintf(stderr, "fix ");
- blocklen = fixlen;
- outbits(s, 1, 2);
- /* block data */
- for (i = 0; i < fixlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen)
- outbits(s, fixed_lcode[sym.n], fixed_llen[sym.n]);
- if (sym.type == SymDist)
- outbits(s, fixed_dcode[sym.n], fixed_dlen[sym.n]);
- if (sym.type == SymExtra)
- outbits(s, sym.n, sym.len);
- }
- /* EOB */
- outbits(s, fixed_lcode[256], fixed_llen[256]);
- }
-fprintf(stderr, "symlen: %d dynsize: %d dynlen: %d fixsize: %d fixlen:%d\n", s->symlen, dynsize, dynlen, fixsize, fixlen);
- s->symstart = (s->symstart + blocklen) % SymSize;
- s->symlen -= blocklen;
-}
-
-/* rounded 8 * log2(n) */
-static int log2_8(uint n) {
- int r = 31*8;
-
- /* 8 * int log2() */
- if (n < 1U<<16) n <<= 16, r -= 16*8;
- if (n < 1U<<24) n <<= 8, r -= 8*8;
- if (n < 1U<<28) n <<= 4, r -= 4*8;
- if (n < 1U<<30) n <<= 2, r -= 2*8;
- if (n < 1U<<31) n <<= 1, r -= 1*8;
- /*
- * result is in [r,r+8), to determine the bottom 3 bits:
- * thresholds are 1 << 31 times an odd power of 2^(1/16)
- */
- if (n <= 0xAD583EEAU) {
- if (n <= 0x91C3D373U)
- r += (n <= 0x85AAC367U ? 0 : 1);
- else
- r += (n <= 0x9EF53260U ? 2 : 3);
+ putbits(s, 1, 2);
+ putblock(s, fixed_lcode, fixed_llen, fixed_dcode, fixed_dlen);
} else {
- if (n <= 0xCE248C15U)
- r += (n <= 0xBD08A39FU ? 4 : 5);
- else
- r += (n <= 0xE0CCDEECU ? 6 : n <= 0xF5257D15L ? 7 : 8);
+ putbits(s, 0, 2);
+ putbits(s, 0, 7); /* align to byte boundary */
+ s->nbits = 0;
+ putbits(s, blocklen, 16);
+ putbits(s, ~blocklen, 16);
+ memcpy(s->dst, s->src, blocklen);
}
- return r;
-}
-
-static int nlog(uint n) {
- return n*log2_8(n);
+fprintf(stderr, "blocklen: %d lzlen: %d dynsize: %d fixsize: %d uncsize: %d\n", blocklen, s->lzp - s->lzbuf, dynsize, fixsize, uncsize);
}
-static void chooseblock(State *s) {
- int lfreq[Nlitlen], dfreq[Ndist];
- int ln, dn;
- int len;
- uint bestrate;
- int bestlen, longestlen, besti;
- int extra;
- int hlen;
- int i;
- Sym sym;
-
- hlen = 600 * 8; /* length of encoded huff tree (usually 400..900 bits) */
- extra = 0;
- memset(lfreq, 0, sizeof(lfreq));
- memset(dfreq, 0, sizeof(dfreq));
- lfreq[256] = 1; /* EOB */
- ln = 1;
- dn = 0;
-/*
- for (i = 0; i < s->symlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen) {
- lfreq[sym.n]++;
- ln++;
- }
- if (sym.type == SymDist) {
- dfreq[sym.n]++;
- dn++;
- }
- if (sym.type == SymExtra)
- extra += 8 * sym.len;
- }
- len = hlen + nlog(ln) + nlog(dn);
- for (i = 0; i < Nlitlen; i++)
- len -= nlog(lfreq[i]);
- for (i = 0; i < Ndist; i++)
- len -= nlog(dfreq[i]);
-*/
- len = hlen;
- bestlen = longestlen = besti = 0;
- bestrate = len << 9;
- for (i = 0; i < s->symlen; i++) {
- sym = getsym(s, i);
- if (sym.type == SymLitlen) {
- if (sym.n < Nlit) {
- uint rate = ((uint)len << 9)/(ln + dn);
-
- if (rate < bestrate) {
- besti = i;
- bestlen = len;
- bestrate = rate;
- }
- longestlen = i;
- }
- len += nlog(lfreq[sym.n]);
- len -= nlog(ln);
- lfreq[sym.n]++;
- ln++;
- len += nlog(ln);
- len -= nlog(lfreq[sym.n]);
- }
- if (sym.type == SymDist) {
- len += nlog(dfreq[sym.n]);
- len -= nlog(dn);
- dfreq[sym.n]++;
- dn++;
- len += nlog(dn);
- len -= nlog(dfreq[sym.n]);
- }
- }
-fprintf(stderr, "i: %u bits: %u rate: %u full: %u\n", besti, bestlen>>3, bestrate>>5, ((uint)len<<9)/(ln+dn) >> 5);
- block(s, besti, longestlen);
-}
-
-static int count = 0;
-
-static void emitlit(State *s, ushort lit) {
+static void recordlit(State *s, ushort lit) {
/*
fprintf(stderr, "%3d %c\n", lit, lit);
*/
-count++;
- putsym(s, makesym(SymLitlen, 0, lit));
+ s->runlen++;
+ s->lfreq[lit]++;
}
static int bisect(ushort *base, int len, int n) {
@@ -589,27 +453,21 @@ static int bisect(ushort *base, int len, int n) {
return lo - 1;
}
-static void emitmatch(State *s, Match m) {
+static void recordmatch(State *s, Match m) {
+ int i;
/*
fprintf(stderr, "m %d, %d\n", m.dist, m.len);
*/
-count++;
- while (m.len > 0) {
- int len;
- int i;
-
- /* match length can be 3..258 */
- len = m.len > 260 ? 258 : m.len <= 258 ? m.len : m.len - 3;
- m.len -= len;
-
- i = bisect(lenbase, Nlen, len);
- putsym(s, makesym(SymLitlen, 0, Nlit + i + 1));
- putsym(s, makesym(SymExtra, lenbits[i], len - lenbase[i]));
-
- i = bisect(distbase, Ndist, m.dist);
- putsym(s, makesym(SymDist, 0, i));
- putsym(s, makesym(SymExtra, distbits[i], m.dist - distbase[i]));
+ if (s->runlen) {
+ *s->lzp++ = s->runlen | LzRunlen;
+ s->runlen = 0;
}
+ i = bisect(lenbase, Nlen, m.len);
+ *s->lzp++ = i | lenbits[i] << 8 | (m.len - lenbase[i]) << 16;
+ s->lfreq[Nlit + i + 1]++; /* TODO: no +1, lextra, dextra */
+ i = bisect(distbase, Ndist, m.dist);
+ *s->lzp++ = i | distbits[i] << 8 | (m.dist - distbase[i]) << 16;
+ s->dfreq[i]++;
}
static short gethash(uchar *p) {
@@ -637,24 +495,41 @@ static Match getmatch(State *s, uchar *src, int len) {
if (j > m.len) {
m.dist = dist;
m.len = j;
+ if (j > MaxMatch) {
+ m.len = MaxMatch;
+ return m;
+ }
}
pos = s->chain[pos];
}
return m;
}
+static void newblock(State *s, uchar *src) {
+ int i;
+
+ s->src = src;
+ s->lzp = s->lzbuf;
+ s->runlen = 0;
+ for (i = 0; i < Nlitlen; i++)
+ s->lfreq[i] = 0;
+ for (i = 0; i < Ndist; i++)
+ s->dfreq[i] = 0;
+}
+
static void lzcompress(State *s, uchar *src, int len) {
int step;
int hash;
Match m;
+ newblock(s, src);
while (len > 0) {
m = getmatch(s, src, len);
if (m.len >= MinMatch) {
- emitmatch(s, m);
+ recordmatch(s, m);
step = m.len;
} else {
- emitlit(s, *src);
+ recordlit(s, *src);
step = 1;
}
while (step > 0) {
@@ -668,7 +543,20 @@ static void lzcompress(State *s, uchar *src, int len) {
len--;
step--;
}
+ if (src - s->src > 1 << 15) {
+ if (s->runlen)
+ *s->lzp++ = s->runlen | LzRunlen;
+ s->lfreq[256]++; /* EOB */
+ emitblock(s, src - s->src);
+ newblock(s, src);
+ }
}
+ if (s->runlen)
+ *s->lzp++ = s->runlen | LzRunlen;
+ s->lfreq[256]++; /* EOB */
+ s->lastblock = 1;
+ emitblock(s, src - s->src);
+ putbits(s, 0, 7);
}
static void lzinit(State *s) {
@@ -679,7 +567,6 @@ static void lzinit(State *s) {
for (i = 0; i < HashSize; i++)
s->head[i] = 0;
s->pos = 0;
-
for (i = 0; i < 144; i++)
fixed_llen[i] = 8;
for (; i < 256; i++)
@@ -692,25 +579,17 @@ static void lzinit(State *s) {
for (i = 0; i < Ndist; i++)
fixed_dlen[i] = 5;
huffcodes(fixed_dcode, fixed_dlen, Ndist);
+ s->bits = 0;
+ s->nbits = 0;
+ s->lastblock = 0;
}
int deflate(uchar *src, int len, uchar *dst) {
State s;
lzinit(&s);
- s.symstart = 0;
- s.symlen = 0;
- s.bits = 0;
- s.nbits = 0;
- s.lastblock = 0;
s.dst = dst;
lzcompress(&s, src, len);
- chooseblock(&s);
- if (s.symlen > 4000)
- chooseblock(&s);
- s.lastblock = 1;
- block(&s, s.symlen, s.symlen);
- outbits(&s, 0, 7);
return s.dst - dst;
}
@@ -724,7 +603,6 @@ int main() {
uchar *dst;
int len;
-
src = malloc(Siz);
dst = malloc(Siz);
len = fread(src, 1, Siz, stdin);