commit 1e706566a0e7dacd026a988cc2f659e5a6657919
parent 21159561f3409a46a0949d92fc131f18784271b5
Author: nsz <nszabolcs@gmail.com>
Date: Wed, 8 Jul 2009 07:50:28 +0200
deflate_simple simplification wip
Diffstat:
deflate_simple.c | | | 378 | ++++++++++++++++++++++++++++++++----------------------------------------------- |
1 file changed, 155 insertions(+), 223 deletions(-)
diff --git a/deflate_simple.c b/deflate_simple.c
@@ -86,91 +86,6 @@ static uchar clenorder[Nclen] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
-static short gethash(uchar *p) {
- return (257 * p[0] + 263 * p[1] + 269 * p[2]) % HashSize;
-}
-
-static Match getmatch(State *s, uchar *src, int len) {
- int i, j;
- int pos;
- int dist;
- int lastdist = 0;
- Match m = {0, 0};
-
- if (len >= MinMatch) {
- pos = s->head[gethash(src)];
- for (i = 0; i < MaxChainStep; i++) {
- dist = (WinSize + s->pos - pos) % WinSize;
- if (dist <= lastdist)
- break;
- lastdist = dist;
- for (j = 0; j < len; j++)
- if (src[j] != src[j - dist])
- break;
- if (j > m.len) {
- m.dist = dist;
- m.len = j;
- }
- pos = s->chain[pos];
- }
- if (m.len < MinMatch)
- m.len = 0;
- }
- return m;
-}
-
-static void emitlit(State *s, ushort lit);
-static void emitmatch(State *s, Match m);
-
-static void lzcompress(State *s, uchar *src, int len) {
- int step;
- int hash;
- Match m;
- Match prevm = {0, 0};
- uchar prevc = '\0';
-
- while (len > 0) {
- m = getmatch(s, src, len);
- if (m.len > 0)
- if (prevm.len > 0) {
- if (m.len > prevm.len + 1) {
- emitlit(s, prevc);
- prevm = m;
- prevc = *src;
- step = 1;
- } else {
- emitmatch(s, prevm);
- step = prevm.len - 1;
- prevm.len = 0;
- }
- } else {
- prevm = m;
- prevc = *src;
- step = 1;
- }
- else
- if (prevm.len > 0) {
- emitmatch(s, prevm);
- step = prevm.len - 1;
- prevm.len = 0;
- } else {
- emitlit(s, *src);
- step = 1;
- }
- while (step > 0) {
- if (len >= MinMatch) {
- hash = gethash(src);
- s->chain[s->pos] = s->head[hash];
- s->head[hash] = s->pos;
- s->pos = (s->pos + 1) % WinSize;
- }
- src++;
- len--;
- step--;
- }
- }
-}
-
static uint revcode(uint c, int n) {
int i;
uint r = 0;
@@ -209,173 +124,127 @@ static void huffcodes(ushort *codes, const uchar *lens, int n) {
codes[i] = 0;
}
-/* min-heap for hufflen() */
+/* min-heap for hufflens() */
-typedef struct {
- int k;
- int v;
-} HeapItem;
-
-static int heapparent(int n) {return (n - 1)/2;}
-static int heapleft(int n) {return 2 * n + 1;}
-static int heapright(int n) {return 2 * n + 2;}
-
-static void heappush(HeapItem *heap, int len, int k, int v) {
- int n, p;
- HeapItem tmp;
-
- heap[len].k = k;
- heap[len].v = v;
- n = len;
- while (n > 0) {
- p = heapparent(n);
- if (heap[n].k < heap[p].k) {
- tmp = heap[n];
- heap[n] = heap[p];
- heap[p] = tmp;
- n = p;
+static int heapparent(int n) {return (n - 2)/4 * 2;}
+static int heapchild(int n) {return 2 * n + 2;}
+
+static int heappush(int *heap, int len, int w, int n) {
+ int p, c, tmp;
+
+ c = len;
+ heap[len++] = n;
+ heap[len++] = w;
+ while (c > 0) {
+ p = heapparent(c);
+ if (heap[c+1] < heap[p+1]) {
+ tmp = heap[c]; heap[c] = heap[p]; heap[p] = tmp;
+ tmp = heap[c+1]; heap[c+1] = heap[p+1]; heap[p+1] = tmp;
+ c = p;
} else
break;
}
+ return len;
}
-static void heappop(HeapItem *heap, int len, int *k, int *v) {
- int n, lc, rc, c;
- HeapItem tmp;
+static int heappop(int *heap, int len, int *w, int *n) {
+ int p, c, tmp;
- *k = heap[0].k;
- *v = heap[0].v;
+ *n = heap[0];
+ *w = heap[1];
+ heap[1] = heap[--len];
heap[0] = heap[--len];
- n = 0;
+ p = 0;
for (;;) {
- lc = heapleft(n);
- rc = heapright(n);
- if (lc >= len)
+ c = heapchild(p);
+ if (c >= len)
break;
- else if (rc >= len || heap[lc].k < heap[rc].k)
- c = lc;
- else
- c = rc;
- if (heap[n].k > heap[c].k) {
- tmp = heap[n];
- heap[n] = heap[c];
- heap[c] = tmp;
+ if (c+2 < len && heap[c+3] < heap[c+1])
+ c += 2;
+ if (heap[p+1] > heap[c+1]) {
+ tmp = heap[p]; heap[p] = heap[c]; heap[c] = tmp;
+ tmp = heap[p+1]; heap[p+1] = heap[c+1]; heap[c+1] = tmp;
} else
break;
- n = c;
+ p = c;
}
+ return len;
}
/* symbol frequencies -> code lengths (limited to 255) */
-static void hufflens(uchar *lens, const int *freqs, int nsym) {
+static void hufflens(uchar *lens, int *freqs, int nsym, int limit) {
+ /* 2 <= nsym <= Nlitlen, log(nsym) <= limit <= CodeBits-1 */
int parent[2*Nlitlen-1];
- int clen[2*Nlitlen-1];
- HeapItem heap[Nlitlen];
- int n, len;
+ int count[CodeBits];
+ int heap[2*Nlitlen];
+ int n, len, top, overflow;
int i, j;
- int fi, fj;
+ int wi, wj;
- if (nsym > Nlitlen)
- abort();
+ for (n = 0; n < limit+1; n++)
+ count[n] = 0;
for (len = n = 0; n < nsym; n++)
if (freqs[n] > 0)
- heappush(heap, len++, freqs[n], n);
+ len = heappush(heap, len, freqs[n], n);
+ else
+ lens[n] = 0;
+ /* deflate: fewer than two symbols: add new */
+ for (n = 0; len < 4; n++)
+ if (freqs[n] == 0)
+ len = heappush(heap, len, ++freqs[n], n);
/* build code tree */
- while (len > 1) {
- heappop(heap, len--, &fi, &i);
- heappop(heap, len--, &fj, &j);
+ top = len;
+ for (n = nsym; len > 2; n++) {
+ len = heappop(heap, len, &wi, &i);
+ len = heappop(heap, len, &wj, &j);
parent[i] = n;
parent[j] = n;
- heappush(heap, len++, fi + fj, n++);
+ len = heappush(heap, len, wi + wj, n);
+ /* keep an ordered list of nodes at the end */
+ heap[len+1] = i;
+ heap[len] = j;
}
- /* calc code length */
- clen[--n] = 0;
- while (n--)
- if (n >= nsym || freqs[n])
- clen[n] = 1 + clen[parent[n]];
- else
- clen[n] = 0;
- /* limit code length to 255 */
- for (n = 0; n < nsym; n++)
- lens[n] = clen[n] > 255 ? 255 : clen[n];
-}
-
-/* F_n if n > 0 */
-static int fib(int n) {
- int a, b, tmp;
-
- for (a = 0, b = 1; n > 1; n--) {
- tmp = a;
- a = b;
- b += tmp;
- }
- return b;
-}
-
-/* symbol frequencies -> (limited) code lengths */
-static void hufflenslimit(uchar *lens, int *freqs, int nsym, int limit) {
- uint minfreq, sumfreq;
- int i, adjust, maxprob, count;
-
- /* TODO: fewer than two symbols: add new */
- if (nsym < 2)
- abort();
- count = 0;
- for (i = 0; i < nsym; i++)
- if (freqs[i] > 0)
- count++;
- if (count < 2)
- for (i = 0; count > 0 && i < nsym; i++)
- if (freqs[i] == 0) {
- freqs[i] = 1;
- count--; /* what? */
+ /* calc code lengths (deflate: with limit) */
+ overflow = 0;
+ parent[--n] = 0;
+ for (i = 2; i < top; i++) {
+ n = heap[i];
+ if (n >= nsym) {
+ /* overwrite parent index with length */
+ parent[n] = parent[parent[n]] + 1;
+ if (parent[n] > limit)
+ overflow++;
+ } else {
+ lens[n] = parent[parent[n]] + 1;
+ if (lens[n] > limit) {
+ lens[n] = limit;
+ overflow++;
}
- /* try building the codes without limit */
- hufflens(lens, freqs, nsym);
- for (i = 0; i < nsym; i++)
- if (lens[i] > limit)
- break;
- if (i == nsym)
+ count[lens[n]]++;
+ }
+ }
+ if (overflow == 0)
return;
- /*
- * code length can be N bits or more only if the probability of a
- * symbol is less than the reciprocal of the (N+2)th fibonacci number
- *
- * if hufflens() returned a long code, we add a constant to all the
- * symbol frequencies and call hufflens() again
- */
- maxprob = fib(limit + 3);
- sumfreq = 0;
- minfreq = (uint)-1;
- for (i = 0; i < nsym; i++)
- if (freqs[i]) {
- if (minfreq > freqs[i])
- minfreq = freqs[i];
- sumfreq += freqs[i];
+ /* modify code tree to fix overflow (from zlib) */
+ while (overflow > 0) {
+ for (n = limit-1; count[n] == 0; n--);
+ count[n]--;
+ count[n+1] += 2;
+ count[limit]--;
+ overflow -= 2;
+ }
+ for (len = limit; len > 0; len--)
+ for (i = count[len]; i > 0;) {
+ n = heap[--top];
+ if (n < nsym) {
+ lens[n] = len;
+ i--;
+ }
}
- if (minfreq > sumfreq / maxprob)
- abort();
- /*
- * adjust is the smallest integer such that
- * (sumfreq + count * adjust)/(minfreq + adjust) < maxprob
- */
- i = maxprob - count;
- adjust = (sumfreq - minfreq * maxprob + i - 1) / i;
- for (i = 0; i < nsym; i++)
- if (freqs[i])
- freqs[i] += adjust;
- /* rebuild the hufftree */
- hufflens(lens, freqs, nsym);
- for (i = 0; i < nsym; i++)
- if (lens[i] > limit)
- abort();
}
/* adding < 16 bits to dst */
static void outbits(State *s, uint bits, int nbits) {
-/*
-fprintf(stderr, "b:%3d n:%3d\n", bits, nbits);
-*/
s->bits |= bits << s->nbits;
s->nbits += nbits;
while (s->nbits >= 8) {
@@ -484,8 +353,8 @@ static void block(State *s, int dynlen, int fixlen) {
else if (sym.type == SymDist)
dfreq[sym.n]++;
}
- hufflenslimit(llen, lfreq, Nlitlen, CodeBits-1);
- hufflenslimit(dlen, dfreq, Ndist, CodeBits-1);
+ hufflens(llen, lfreq, Nlitlen, CodeBits-1);
+ hufflens(dlen, dfreq, Ndist, CodeBits-1);
huffcodes(lcode, llen, Nlitlen);
huffcodes(dcode, dlen, Ndist);
for (nlit = Nlitlen; nlit > Nlit && llen[nlit-1] == 0; nlit--);
@@ -498,7 +367,7 @@ static void block(State *s, int dynlen, int fixlen) {
if (sym.type == SymClen)
cfreq[sym.n]++;
}
- hufflenslimit(clen, cfreq, Nclen, 7);
+ hufflens(clen, cfreq, Nclen, 7);
huffcodes(ccode, clen, Nclen);
for (nclen = Nclen; nclen > 4 && clen[clenorder[nclen-1]] == 0; nclen--);
@@ -743,6 +612,65 @@ count++;
}
}
+static short gethash(uchar *p) {
+ return (257 * p[0] + 263 * p[1] + 269 * p[2]) % HashSize;
+}
+
+static Match getmatch(State *s, uchar *src, int len) {
+ int i, j;
+ int pos;
+ int dist;
+ int lastdist = 0;
+ Match m = {0, 0};
+
+ if (len < MinMatch)
+ return m;
+ pos = s->head[gethash(src)];
+ for (i = 0; i < MaxChainStep; i++) {
+ dist = (WinSize + s->pos - pos) % WinSize;
+ if (dist <= lastdist)
+ break;
+ lastdist = dist;
+ for (j = 0; j < len; j++)
+ if (src[j] != src[j - dist])
+ break;
+ if (j > m.len) {
+ m.dist = dist;
+ m.len = j;
+ }
+ pos = s->chain[pos];
+ }
+ return m;
+}
+
+static void lzcompress(State *s, uchar *src, int len) {
+ int step;
+ int hash;
+ Match m;
+
+ while (len > 0) {
+ m = getmatch(s, src, len);
+ if (m.len >= MinMatch) {
+ emitmatch(s, m);
+ step = m.len;
+ } else {
+ emitlit(s, *src);
+ step = 1;
+ }
+ while (step > 0) {
+ if (len >= MinMatch) {
+ hash = gethash(src);
+ s->chain[s->pos] = s->head[hash];
+ s->head[hash] = s->pos;
+ s->pos = (s->pos + 1) % WinSize;
+ }
+ src++;
+ len--;
+ step--;
+ }
+ }
+}
+
static void lzinit(State *s) {
int i;
@@ -796,11 +724,15 @@ int main() {
uchar *dst;
int len;
+
src = malloc(Siz);
dst = malloc(Siz);
len = fread(src, 1, Siz, stdin);
len = deflate(src, len, dst);
fwrite(dst, 1, len, stdout);
fprintf(stderr, "compressed: %d\n", len);
+ free(dst);
+ free(src);
+
return 0;
}