import { BufferCursor } from "../../BufferCursor";
/**
 * Compresses the contents of `src` and returns a new cursor over the
 * compressed bytes, positioned at its start.
 *
 * The output is a stream of control bits (collected into flag bytes by
 * `Context.set_bit`) interleaved with data bytes:
 *
 * - bit `1`: one literal byte follows.
 * - bits `0,0,b1,b0`: short match of length `(b1 b0) + 2`, followed by one
 *   byte holding the low 8 bits of the (negative) offset.
 * - bits `0,1`: long match, followed by two bytes holding the low 13 bits of
 *   the (negative) offset and a 3-bit length field; when the length field is
 *   zero a third byte holds `length - 1`.
 * - bits `0,1` followed by two zero bytes: end of stream (see
 *   `Context.write_eof`).
 *
 * @param src Cursor over the data to compress. It is advanced as it is read.
 * @returns The destination cursor, rewound with `seek_start(0)`.
 */
export function compress(src: BufferCursor): BufferCursor {
    const ctx = new Context(src);
    const hash_table = new HashTable();

    if (ctx.src.size <= 3) {
        // Make a literal copy of the input.
        while (ctx.src.bytes_left) {
            ctx.set_bit(1);
            ctx.copy_literal();
        }
    } else {
        // Add the first two "strings" to the hash table. The offsets are taken
        // from the cursor instead of being hard-coded as 0 and 1, so that the
        // table stays consistent with the positions find_longest_match stores
        // even if the cursor does not start at 0.
        const start = ctx.src.position;
        hash_table.put(hash_table.hash(ctx.src), start);
        ctx.src.seek(1);
        hash_table.put(hash_table.hash(ctx.src), start + 1);
        ctx.src.seek(-1);

        // Copy the first two bytes as literals.
        ctx.set_bit(1);
        ctx.copy_literal();
        ctx.set_bit(1);
        ctx.copy_literal();

        while (ctx.src.bytes_left > 1) {
            const [offset, mlen] = ctx.find_longest_match(hash_table, false);

            if (mlen > 0) {
                // Peek one byte ahead ("lazy match") without registering that
                // position in the hash table.
                ctx.src.seek(1);
                const [offset2, mlen2] = ctx.find_longest_match(hash_table, true);
                ctx.src.seek(-1);

                // Did the "lazy match" produce something more compressed?
                if (mlen2 > mlen && !keeps_short_match(offset, mlen, offset2, mlen2)) {
                    ctx.set_bit(1);
                    ctx.copy_literal();
                    continue;
                }

                if (emit_match(ctx, hash_table, offset, mlen)) {
                    continue;
                }
            }

            // If we get here, we didn't find a suitable match, so we just make a literal copy.
            ctx.set_bit(1);
            ctx.copy_literal();
        }

        // If there's a left over byte at the end, make a literal copy.
        if (ctx.src.bytes_left) {
            ctx.set_bit(1);
            ctx.copy_literal();
        }
    }

    ctx.write_eof();

    return ctx.dst.seek_start(0);
}

/**
 * Decides whether the current match should be kept even though the lazy match
 * one byte further is longer: the current match fits the short encoding
 * (length 2..5, offset >= -256), the lazy match lies further back and needs
 * the long encoding, and it gains fewer than 3 bytes.
 */
function keeps_short_match(offset: number, mlen: number, offset2: number, mlen2: number): boolean {
    return (
        mlen >= 2 &&
        mlen <= 5 &&
        offset2 < offset &&
        offset >= -256 &&
        offset2 < -256 &&
        mlen2 - mlen < 3
    );
}

/**
 * Writes the match `(offset, mlen)` using the smallest encoding that can
 * represent it and registers the skipped positions in `hash_table`.
 *
 * @returns `true` if a match was written, `false` if no encoding applies
 * (length 1, or length 2 with an offset below -256), in which case nothing is
 * written and the caller falls back to a literal.
 */
function emit_match(ctx: Context, hash_table: HashTable, offset: number, mlen: number): boolean {
    if (mlen >= 2 && mlen <= 5 && offset >= -256) {
        // Short match.
        ctx.set_bit(0);
        ctx.set_bit(0);
        ctx.set_bit((mlen - 2) & 0x02);
        ctx.set_bit((mlen - 2) & 0x01);
        ctx.write_literal(offset & 0xff);
        ctx.add_intermediates(hash_table, mlen);
        return true;
    }

    if (mlen >= 3 && mlen <= 9) {
        // Long match, short length.
        ctx.set_bit(0);
        ctx.set_bit(1);
        ctx.write_literal(((offset & 0x1f) << 3) | ((mlen - 2) & 0x07));
        // offset is negative, so mask explicitly rather than relying on the
        // byte writer to wrap a negative value.
        ctx.write_literal((offset >> 5) & 0xff);
        ctx.add_intermediates(hash_table, mlen);
        return true;
    }

    if (mlen > 9) {
        // Long match, long length. The length byte stores length - 1, so a
        // single match covers at most 256 bytes.
        const len = Math.min(mlen, 256);

        ctx.set_bit(0);
        ctx.set_bit(1);
        ctx.write_literal((offset & 0x1f) << 3);
        ctx.write_literal((offset >> 5) & 0xff);
        ctx.write_literal(len - 1);
        ctx.add_intermediates(hash_table, len);
        return true;
    }

    return false;
}
/**
 * Window size in bytes: the limit applied to the distance between the current
 * position and a match candidate, and the number of slots in
 * `HashTable.masked_offset_to_prev`.
 */
const MAX_WINDOW = 0x2000;

/** Bit mask that maps a source offset onto its slot in the window-sized ring. */
const WINDOW_MASK = MAX_WINDOW - 1;

/** Number of buckets in `HashTable.hash_to_offset` (one per possible byte-sized hash). */
const HASH_SIZE = 1 << 8;
/**
 * Mutable state of one compression run: the source cursor being read, the
 * destination cursor being written, and the control-bit byte currently being
 * assembled.
 */
class Context {
    /** Cursor over the uncompressed input. */
    readonly src: BufferCursor;
    /** Cursor over the compressed output. */
    readonly dst: BufferCursor;
    /** Control bits collected so far for the current flag byte. */
    flags: number;
    /** How many more control bits fit into the current flag byte. */
    flag_bits_left: number;
    /** Position in `dst` of the placeholder reserved for the current flag byte. */
    flag_offset: number;

    /**
     * @param cursor Source data. The destination cursor is created with the
     * source's size and endianness.
     */
    constructor(cursor: BufferCursor) {
        this.src = cursor;
        this.dst = new BufferCursor(cursor.size, cursor.little_endian);
        this.flags = 0;
        this.flag_bits_left = 0;
        this.flag_offset = 0;
    }

    /**
     * Appends one control bit. Bits enter at the top of the flag byte and are
     * shifted right, so after eight bits the first one sits in bit 0.
     *
     * @param bit Any truthy number sets the bit, 0 clears it.
     */
    set_bit(bit: number): void {
        if (!this.flag_bits_left--) {
            // Write out the flags to their position in the file, and store the next flags byte position.
            const pos = this.dst.position;
            this.dst
                .seek_start(this.flag_offset)
                .write_u8(this.flags)
                .seek_start(pos)
                .write_u8(0); // Placeholder for the next flags byte.
            this.flag_offset = pos;
            this.flag_bits_left = 7;
        }

        this.flags >>>= 1;

        if (bit) {
            this.flags |= 0x80;
        }
    }

    /** Copies the next source byte to the destination, advancing both cursors. */
    copy_literal(): void {
        this.dst.write_u8(this.src.u8());
    }

    /** Writes `value` to the destination as a single byte. */
    write_literal(value: number): void {
        this.dst.write_u8(value);
    }

    /**
     * Flushes the partially filled flag byte into its reserved placeholder,
     * shifting the collected bits down past the unused positions first. The
     * destination position is restored afterwards.
     */
    writeFinalFlags(): void {
        this.flags >>>= this.flag_bits_left;
        const pos = this.dst.position;
        this.dst
            .seek_start(this.flag_offset)
            .write_u8(this.flags)
            .seek_start(pos);
    }

    /**
     * Terminates the stream: control bits 0,1 (the long-match prefix), the
     * final flag byte, then two zero bytes.
     */
    write_eof(): void {
        this.set_bit(0);
        this.set_bit(1);

        this.writeFinalFlags();

        this.write_literal(0);
        this.write_literal(0);
    }

    /**
     * Counts how many consecutive bytes starting at `s2` equal the bytes
     * starting at the current source position. Comparison stops at the end of
     * the array view; the regions may overlap.
     *
     * NOTE(review): assumes indices into `uint8_array_view()` line up with
     * `src.position` — confirm against BufferCursor.
     *
     * @param s2 Index of the earlier candidate string.
     */
    match_length(s2: number): number {
        const array = this.src.uint8_array_view();
        let len = 0;
        let s1 = this.src.position;

        while (s1 < array.byteLength && array[s1] === array[s2]) {
            ++len;
            ++s1;
            ++s2;
        }

        return len;
    }

    /**
     * Searches the hash chain for the longest earlier occurrence of the string
     * at the current source position.
     *
     * @param hash_table Table of previously seen positions.
     * @param lazy When true the current position is NOT added to the table
     * (used for the one-byte look-ahead).
     * @returns `[offset, length]` where `offset` is the match position minus
     * the current position (negative), or `[0, 0]` if nothing matched.
     */
    find_longest_match(hash_table: HashTable, lazy: boolean): [number, number] {
        if (!this.src.bytes_left) {
            return [0, 0];
        }

        // Figure out where we're looking.
        const hash = hash_table.hash(this.src);

        // If there is nothing in the table at that point, bail out now.
        let entry = hash_table.get(hash);

        if (entry === null) {
            if (!lazy) {
                hash_table.put(hash, this.src.position);
            }

            return [0, 0];
        }

        // If we'd go outside the window, truncate the hash chain now.
        // A distance of exactly MAX_WINDOW is rejected too: its 13-bit offset
        // field is all zeroes, so a long match with a separate length byte
        // would start with the same two zero bytes that write_eof() emits.
        if (this.src.position - entry >= MAX_WINDOW) {
            hash_table.hash_to_offset[hash] = null;

            if (!lazy) {
                hash_table.put(hash, this.src.position);
            }

            return [0, 0];
        }

        // Ok, we have something in the hash table that matches the hash value.
        // Follow the chain to see if we have an actual string match, and find the longest match.
        let longest_length = 0;
        let longest_match = 0;

        while (entry != null) {
            const mlen = this.match_length(entry);

            if (mlen > longest_length || mlen >= 256) {
                longest_length = mlen;
                longest_match = entry;
            }

            // Follow the chain, making sure to stay inside the window.
            let entry_2 = hash_table.prev(entry);

            if (entry_2 !== null) {
                // If we'd go outside the window, truncate the hash chain now.
                if (this.src.position - entry_2 >= MAX_WINDOW) {
                    hash_table.set_prev(entry, null);
                    entry_2 = null;
                }
            }

            entry = entry_2;
        }

        // Add our current string to the hash.
        if (!lazy) {
            hash_table.put(hash, this.src.position);
        }

        // Did we find a match?
        const offset = longest_length > 0 ? longest_match - this.src.position : 0;
        return [offset, longest_length];
    }

    /**
     * Advances the source cursor past a match of `len` bytes, adding every
     * skipped position except the first to the hash table. The first position
     * is not added here; find_longest_match adds it when called with
     * `lazy` false.
     */
    add_intermediates(hash_table: HashTable, len: number): void {
        this.src.seek(1);

        for (let i = 1; i < len; ++i) {
            const hash = hash_table.hash(this.src);
            hash_table.put(hash, this.src.position);
            this.src.seek(1);
        }
    }
}
/**
 * Chained hash table over source positions.
 *
 * `hash_to_offset` holds the most recently added position for each hash;
 * `masked_offset_to_prev` links a position to the previously added position
 * with the same hash. The link array is indexed by `offset & WINDOW_MASK`, so
 * positions that are MAX_WINDOW apart share a slot.
 */
class HashTable {
    readonly hash_to_offset: (number | null)[] = new Array<number | null>(HASH_SIZE).fill(null);
    readonly masked_offset_to_prev: (number | null)[] = new Array<number | null>(MAX_WINDOW).fill(
        null,
    );

    /**
     * Hashes the string at the cursor: the next byte XORed with the byte after
     * it, or just the next byte when it is the last one. The cursor is moved
     * back to where it started.
     */
    hash(cursor: BufferCursor): number {
        let hash = cursor.u8();

        if (cursor.bytes_left) {
            hash ^= cursor.u8();
            cursor.seek(-1);
        }

        cursor.seek(-1);
        return hash;
    }

    /**
     * @returns The most recently added position for `hash`, or `null` if there
     * is none (including when `hash` lies outside the table).
     */
    get(hash: number): number | null {
        const offset = this.hash_to_offset[hash];
        return offset === undefined ? null : offset;
    }

    /** Makes `offset` the head of the chain for `hash`, linking it to the old head. */
    put(hash: number, offset: number): void {
        this.set_prev(offset, this.get(hash));
        this.hash_to_offset[hash] = offset;
    }

    /** @returns The position stored in the link slot of `offset`, or `null`. */
    prev(offset: number): number | null {
        const prev_offset = this.masked_offset_to_prev[offset & WINDOW_MASK];
        return prev_offset === undefined ? null : prev_offset;
    }

    /** Overwrites the link slot of `offset`; `null` ends the chain there. */
    set_prev(offset: number, prevOffset: number | null): void {
        this.masked_offset_to_prev[offset & WINDOW_MASK] = prevOffset;
    }
}