.

2026-05-23 14:55:42 +00:00 · 2025-10-21 23:45:13 +07:00
parent 6c387b420c
commit bb60e987e5
3548 changed files with 4952576 additions and 116 deletions
@@ -0,0 +1,45 @@
+package matchfinder
+
+// An absoluteMatch is like a Match, but it stores indexes into the byte
+// stream instead of lengths.
+//
+// M4.FindMatches compares values against absoluteMatch{} and treats the zero
+// value as "no match".
+type absoluteMatch struct {
+	// Start is the index of the first byte.
+	Start int
+
+	// End is the index of the byte after the last byte
+	// (so that End - Start = Length).
+	End int
+
+	// Match is the index of the previous data that matches
+	// (Start - Match = Distance).
+	Match int
+}
+
+// A matchEmitter manages the output of matches for a MatchFinder.
+// It converts absoluteMatch values into relative Match values, keeping track
+// of how much of the input has already been accounted for.
+type matchEmitter struct {
+	// Dst is the destination slice that Matches are added to.
+	Dst []Match
+
+	// NextEmit is the index of the next byte to emit.
+	// emit sets it to the End of the match it has just appended, and uses it
+	// to compute the Unmatched count of the next match.
+	NextEmit int
+}
+
+// emit converts m to a relative Match, appends it to e.Dst, and advances
+// e.NextEmit to the end of m. The bytes between the old NextEmit and m.Start
+// become the Match's Unmatched count.
+func (e *matchEmitter) emit(m absoluteMatch) {
+	rel := Match{
+		Unmatched: m.Start - e.NextEmit,
+		Length:    m.End - m.Start,
+		Distance:  m.Start - m.Match,
+	}
+	e.NextEmit = m.End
+	e.Dst = append(e.Dst, rel)
+}
+
+// trim clamps the end of m to maxEnd and emits the result. If fewer than
+// minLength bytes remain (whether or not any clamping happened), m is
+// dropped and nothing is emitted.
+func (e *matchEmitter) trim(m absoluteMatch, maxEnd int, minLength int) {
+	end := m.End
+	if maxEnd < end {
+		end = maxEnd
+	}
+	if end-m.Start < minLength {
+		return
+	}
+	m.End = end
+	e.emit(m)
+}
@@ -0,0 +1,169 @@
+package matchfinder
+
+import (
+	"encoding/binary"
+)
+
+// M0 is an implementation of the MatchFinder interface based
+// on the algorithm used by snappy, but modified to be more like the algorithm
+// used by compression level 0 of the brotli reference implementation.
+//
+// It has a maximum block size of 65536 bytes.
+type M0 struct {
+	// Lazy turns on "lazy matching," for higher compression but less speed.
+	Lazy bool
+
+	// MaxDistance, if nonzero, is the largest match distance that will be
+	// used; candidates farther back than this are skipped.
+	// MaxLength, if nonzero, is the largest match length that will be
+	// reported; longer matches are cut off at MaxLength bytes.
+	MaxDistance int
+	MaxLength   int
+}
+
+// Reset does nothing: M0 has a value receiver, and FindMatches keeps its hash
+// table in a local variable, so no state survives between calls.
+func (M0) Reset() {}
+
+const (
+	// m0HashLen is the number of input bytes that contribute to M0.hash.
+	m0HashLen = 5
+
+	// m0TableBits is the number of bits in a hash table index, and
+	// m0TableSize is the resulting number of table entries.
+	// NOTE(review): m0Shift is not referenced by the code visible here;
+	// M0.hash shifts by 64 - m0TableBits instead. Confirm whether it is
+	// still needed.
+	m0TableBits = 14
+	m0TableSize = 1 << m0TableBits
+	m0Shift     = 32 - m0TableBits
+	// m0TableMask is redundant, but helps the compiler eliminate bounds
+	// checks.
+	m0TableMask = m0TableSize - 1
+)
+
+func (m M0) hash(data uint64) uint64 {
+	hash := (data << (64 - 8*m0HashLen)) * hashMul64
+	return hash >> (64 - m0TableBits)
+}
+
+// FindMatches looks for matches in src, appends them to dst, and returns dst.
+// src must not be longer than 65536 bytes.
+//
+// Inputs shorter than minNonLiteralBlockSize (17 bytes) are reported as a
+// single Match with only Unmatched set. Longer inputs that exceed 65536 bytes
+// cause a panic; positions are stored in the hash table as uint16 values.
+// Any bytes remaining after the last match are reported in a final Match
+// with only Unmatched set.
+func (m M0) FindMatches(dst []Match, src []byte) []Match {
+	const inputMargin = 16 - 1
+	const minNonLiteralBlockSize = 1 + 1 + inputMargin
+
+	if len(src) < minNonLiteralBlockSize {
+		dst = append(dst, Match{
+			Unmatched: len(src),
+		})
+		return dst
+	}
+	if len(src) > 65536 {
+		panic("block too long")
+	}
+
+	// table maps a hash value to the most recent position in src that
+	// produced it. It starts zeroed, so a slot that was never written reads
+	// as position 0.
+	var table [m0TableSize]uint16
+
+	// sLimit is where to stop looking for matches. Stopping inputMargin
+	// bytes before the end of src keeps the 8-byte loads used for hashing
+	// in the main loop inside src.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is the index in src of the first byte that is not yet covered
+	// by a Match appended to dst.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := m.hash(binary.LittleEndian.Uint64(src[s:]))
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch the previous position with the same hash as s, record s
+			// in its place, and precompute the hash for the next probe.
+			candidate = int(table[nextHash&m0TableMask])
+			table[nextHash&m0TableMask] = uint16(s)
+			nextHash = m.hash(binary.LittleEndian.Uint64(src[nextS:]))
+			if m.MaxDistance != 0 && s-candidate > m.MaxDistance {
+				continue
+			}
+			// A hash hit is only a hint; accept the candidate only if the
+			// first four bytes really are equal.
+			if binary.LittleEndian.Uint32(src[s:]) == binary.LittleEndian.Uint32(src[candidate:]) {
+				break
+			}
+		}
+
+		// Invariant: we have a 4-byte match at s.
+		// base is the start of the match; after extendMatch, s is its end.
+		base := s
+		s = extendMatch(src, candidate+4, s+4)
+
+		// origBase remembers where the first match started, because base may
+		// be moved forward by the lazy step below.
+		origBase := base
+		if m.Lazy && base+1 < sLimit {
+			// Lazy matching: also try a match starting one byte later, and
+			// switch to it only if it is more than one byte longer than the
+			// match already found.
+			newBase := base + 1
+			h := m.hash(binary.LittleEndian.Uint64(src[newBase:]))
+			newCandidate := int(table[h&m0TableMask])
+			table[h&m0TableMask] = uint16(newBase)
+			okDistance := true
+			if m.MaxDistance != 0 && newBase-newCandidate > m.MaxDistance {
+				okDistance = false
+			}
+			if okDistance && binary.LittleEndian.Uint32(src[newBase:]) == binary.LittleEndian.Uint32(src[newCandidate:]) {
+				newS := extendMatch(src, newCandidate+4, newBase+4)
+				if newS-newBase > s-base+1 {
+					s = newS
+					base = newBase
+					candidate = newCandidate
+				}
+			}
+		}
+
+		// Cut the match off at MaxLength bytes if a limit is configured.
+		if m.MaxLength != 0 && s-base > m.MaxLength {
+			s = base + m.MaxLength
+		}
+		dst = append(dst, Match{
+			Unmatched: base - nextEmit,
+			Length:    s - base,
+			Distance:  base - candidate,
+		})
+		nextEmit = s
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		if m.Lazy {
+			// If lazy matching is enabled, we update the hash table for
+			// every byte in the match.
+			for i := origBase + 2; i < s-1; i++ {
+				x := binary.LittleEndian.Uint64(src[i:])
+				table[m.hash(x)&m0TableMask] = uint16(i)
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		// x holds the eight bytes starting at s-1; shifting it right by 8
+		// drops the byte at s-1, which is enough for the hash at s because
+		// the hash only uses the low m0HashLen bytes.
+		x := binary.LittleEndian.Uint64(src[s-1:])
+		prevHash := m.hash(x >> 0)
+		table[prevHash&m0TableMask] = uint16(s - 1)
+		nextHash = m.hash(x >> 8)
+	}
+
+emitRemainder:
+	// Report whatever follows the last match as unmatched bytes.
+	if nextEmit < len(src) {
+		dst = append(dst, Match{
+			Unmatched: len(src) - nextEmit,
+		})
+	}
+	return dst
+}
@@ -0,0 +1,297 @@
+package matchfinder
+
+import (
+	"encoding/binary"
+	"math/bits"
+	"runtime"
+)
+
+// M4 is an implementation of the MatchFinder
+// interface that uses a hash table to find matches,
+// optional match chains,
+// and the advanced parsing technique from
+// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html.
+//
+// Fields left at zero are replaced with their defaults by FindMatches.
+type M4 struct {
+	// MaxDistance is the maximum distance (in bytes) to look back for
+	// a match. The default is 65535.
+	MaxDistance int
+
+	// MinLength is the length of the shortest match to return.
+	// The default is 4.
+	MinLength int
+
+	// HashLen is the number of bytes to use to calculate the hashes.
+	// The maximum is 8 and the default is 6.
+	HashLen int
+
+	// TableBits is the number of bits in the hash table indexes.
+	// The default is 17 (128K entries).
+	TableBits int
+
+	// ChainLength is how many entries to search on the "match chain" of older
+	// locations with the same hash as the current location.
+	// If it is zero, no chain is maintained and only the hash table's own
+	// candidate is tried.
+	ChainLength int
+
+	// DistanceBitCost is used when comparing two matches to see
+	// which is better. The comparison is primarily based on the length
+	// of the matches, but it can also take the distance into account,
+	// in terms of the number of bits needed to represent the distance.
+	// One byte of length is given a score of 256, so 32 (256/8) would
+	// be a reasonable first guess for the value of one bit.
+	// (The default is 0, which bases the comparison solely on length.)
+	DistanceBitCost int
+
+	// table maps a hash value to the most recent position in history that
+	// produced it. A stored position of 0 is treated as "empty".
+	table []uint32
+	// chain is only maintained when ChainLength > 0. chain[i] is the
+	// distance from position i back to the previous position with the same
+	// hash, or 0 if there is none or the distance does not fit in 16 bits.
+	chain []uint16
+
+	// history holds the bytes passed to earlier FindMatches calls (trimmed
+	// to MaxDistance bytes once it grows past twice that), followed by the
+	// current block.
+	history []byte
+}
+
+// Reset discards all state from the previous stream: every hash table slot is
+// zeroed, and the history and chain buffers are emptied while keeping their
+// capacity for reuse.
+func (q *M4) Reset() {
+	q.chain = q.chain[:0]
+	q.history = q.history[:0]
+	for slot := 0; slot < len(q.table); slot++ {
+		q.table[slot] = 0
+	}
+}
+
+// score rates m so that it can be compared with other matches. Each byte of
+// length is worth 256 points, and every leading zero bit of the distance
+// (taken as a 32-bit value) adds DistanceBitCost, so shorter distances score
+// higher when DistanceBitCost is positive.
+func (q *M4) score(m absoluteMatch) int {
+	length := m.End - m.Start
+	distance := uint32(m.Start - m.Match)
+	return length*256 + bits.LeadingZeros32(distance)*q.DistanceBitCost
+}
+
+// FindMatches looks for matches in src, appends them to dst, and returns dst.
+//
+// Configuration fields that are still zero (MaxDistance, MinLength, HashLen,
+// TableBits) are set to their defaults. src is appended to q.history, so
+// matches may refer back to data from earlier calls, up to MaxDistance bytes.
+// Up to three candidate matches are held back at a time so that overlapping
+// matches can be compared before anything is emitted.
+func (q *M4) FindMatches(dst []Match, src []byte) []Match {
+	if q.MaxDistance == 0 {
+		q.MaxDistance = 65535
+	}
+	if q.MinLength == 0 {
+		q.MinLength = 4
+	}
+	if q.HashLen == 0 {
+		q.HashLen = 6
+	}
+	if q.TableBits == 0 {
+		q.TableBits = 17
+	}
+	if len(q.table) < 1<<q.TableBits {
+		q.table = make([]uint32, 1<<q.TableBits)
+	}
+
+	e := matchEmitter{Dst: dst}
+
+	if len(q.history) > q.MaxDistance*2 {
+		// Trim down the history buffer.
+		// Only the last MaxDistance bytes are kept, moved to the front.
+		delta := len(q.history) - q.MaxDistance
+		copy(q.history, q.history[delta:])
+		q.history = q.history[:q.MaxDistance]
+		// NOTE(review): history is shifted down by delta above, but chain is
+		// only truncated here, not shifted. Confirm whether the chain
+		// entries should move along with the bytes they describe.
+		if q.ChainLength > 0 {
+			q.chain = q.chain[:q.MaxDistance]
+		}
+
+		// Rebase the stored positions. Positions that pointed into the
+		// discarded part become 0, which is treated as an empty slot.
+		for i, v := range q.table {
+			newV := int(v) - delta
+			if newV < 0 {
+				newV = 0
+			}
+			q.table[i] = uint32(newV)
+		}
+	}
+
+	// Append src to the history buffer.
+	// From here on, src refers to the whole history, and e.NextEmit is the
+	// index where the new block starts.
+	e.NextEmit = len(q.history)
+	q.history = append(q.history, src...)
+	if q.ChainLength > 0 {
+		q.chain = append(q.chain, make([]uint16, len(src))...)
+	}
+	src = q.history
+
+	// matches stores the matches that have been found but not emitted,
+	// in reverse order. (matches[0] is the most recent one.)
+	var matches [3]absoluteMatch
+	// The loop stops 7 bytes before the end so that the 8-byte load used for
+	// hashing stays inside src.
+	for i := e.NextEmit; i < len(src)-7; i++ {
+		if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
+			// We have found some matches, and we're far enough along that we probably
+			// won't find overlapping matches, so we might as well emit them.
+			if matches[1] != (absoluteMatch{}) {
+				e.trim(matches[1], matches[0].Start, q.MinLength)
+			}
+			e.emit(matches[0])
+			matches = [3]absoluteMatch{}
+		}
+
+		// Calculate and store the hash.
+		// Only the low HashLen bytes of the 8-byte load take part, and the
+		// top TableBits bits of the product form the table index.
+		h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
+		candidate := int(q.table[h])
+		q.table[h] = uint32(i)
+		if q.ChainLength > 0 && candidate != 0 {
+			// Link i to the previous position with the same hash, provided
+			// the distance fits in the chain's 16-bit entries.
+			delta := i - candidate
+			if delta < 1<<16 {
+				q.chain[i] = uint16(delta)
+			}
+		}
+
+		// While i is still inside the newest pending match, the search is
+		// skipped, except at the single position End+2-HashLen.
+		if i < matches[0].End && i != matches[0].End+2-q.HashLen {
+			continue
+		}
+		if candidate == 0 || i-candidate > q.MaxDistance {
+			continue
+		}
+
+		// Look for a match.
+		var currentMatch absoluteMatch
+
+		// A candidate at the same distance as the newest pending match is
+		// not examined.
+		if i-candidate != matches[0].Start-matches[0].Match {
+			if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
+				m := extendMatch2(src, i, candidate, e.NextEmit)
+				if m.End-m.Start > q.MinLength {
+					currentMatch = m
+				}
+			}
+		}
+
+		// Walk up to ChainLength older positions with the same hash, keeping
+		// whichever match scores highest.
+		for j := 0; j < q.ChainLength; j++ {
+			delta := q.chain[candidate]
+			if delta == 0 {
+				break
+			}
+			candidate -= int(delta)
+			if candidate <= 0 || i-candidate > q.MaxDistance {
+				break
+			}
+			if i-candidate != matches[0].Start-matches[0].Match {
+				if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
+					m := extendMatch2(src, i, candidate, e.NextEmit)
+					if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
+						currentMatch = m
+					}
+				}
+			}
+		}
+
+		// This also rejects the case where no match was found at all
+		// (currentMatch is still the zero value).
+		if currentMatch.End-currentMatch.Start < q.MinLength {
+			continue
+		}
+
+		// A new match that overlaps a pending one must beat it by more than
+		// overlapPenalty points to be kept.
+		overlapPenalty := 0
+		if matches[0] != (absoluteMatch{}) {
+			overlapPenalty = 275
+			if currentMatch.Start <= matches[1].End {
+				// This match would completely replace the previous match,
+				// so there is no penalty for overlap.
+				overlapPenalty = 0
+			}
+		}
+
+		if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty {
+			continue
+		}
+
+		// Push currentMatch onto the front of the pending list.
+		matches = [3]absoluteMatch{
+			currentMatch,
+			matches[0],
+			matches[1],
+		}
+
+		if matches[2] == (absoluteMatch{}) {
+			continue
+		}
+
+		// We have three matches, so it's time to emit one and/or eliminate one.
+		switch {
+		case matches[0].Start < matches[2].End:
+			// The first and third matches overlap; discard the one in between.
+			matches = [3]absoluteMatch{
+				matches[0],
+				matches[2],
+				absoluteMatch{},
+			}
+
+		case matches[0].Start < matches[2].End+q.MinLength:
+			// The first and third matches don't overlap, but there's no room for
+			// another match between them. Emit the first match and discard the second.
+			e.emit(matches[2])
+			matches = [3]absoluteMatch{
+				matches[0],
+				absoluteMatch{},
+				absoluteMatch{},
+			}
+
+		default:
+			// Emit the first match, shortening it if necessary to avoid overlap with the second.
+			e.trim(matches[2], matches[1].Start, q.MinLength)
+			matches[2] = absoluteMatch{}
+		}
+	}
+
+	// We've found all the matches now; emit the remaining ones.
+	if matches[1] != (absoluteMatch{}) {
+		e.trim(matches[1], matches[0].Start, q.MinLength)
+	}
+	if matches[0] != (absoluteMatch{}) {
+		e.emit(matches[0])
+	}
+
+	// Report whatever follows the last emitted match as unmatched bytes.
+	dst = e.Dst
+	if e.NextEmit < len(src) {
+		dst = append(dst, Match{
+			Unmatched: len(src) - e.NextEmit,
+		})
+	}
+
+	return dst
+}
+
+// hashMul64 is the 64-bit multiplier used to compute hash table indexes in
+// M0.hash and M4.FindMatches.
+const hashMul64 = 0x1E35A7BD1E35A7BD
+
+// extendMatch returns the largest k such that k <= len(src) and that
+// src[i:i+k-j] and src[j:k] have the same contents.
+//
+// It assumes that:
+//
+//	0 <= i && i < j && j <= len(src)
+//
+// The word-at-a-time paths are an optimization only; every architecture
+// produces the same result, because the bytes are decoded explicitly as
+// little-endian rather than loaded in native byte order.
+func extendMatch(src []byte, i, j int) int {
+	switch runtime.GOARCH {
+	case "amd64", "arm64":
+		// As long as we are 8 or more bytes before the end of src, we can load and
+		// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+		for j+8 < len(src) {
+			iBytes := binary.LittleEndian.Uint64(src[i:])
+			jBytes := binary.LittleEndian.Uint64(src[j:])
+			if iBytes != jBytes {
+				// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+				// the index of the first byte that differs. Little-endian decoding puts
+				// the lowest-addressed byte in the least significant bits, so the lowest
+				// set bit of the XOR lies in the first differing byte, and the shift by 3
+				// converts a bit index to a byte index.
+				return j + bits.TrailingZeros64(iBytes^jBytes)>>3
+			}
+			i, j = i+8, j+8
+		}
+	case "386":
+		// On a 32-bit CPU, we do it 4 bytes at a time.
+		for j+4 < len(src) {
+			iBytes := binary.LittleEndian.Uint32(src[i:])
+			jBytes := binary.LittleEndian.Uint32(src[j:])
+			if iBytes != jBytes {
+				return j + bits.TrailingZeros32(iBytes^jBytes)>>3
+			}
+			i, j = i+4, j+4
+		}
+	}
+	// Finish (or, on other architectures, do all of) the comparison one byte
+	// at a time.
+	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
+	}
+	return j
+}
+
+// extendMatch2 takes a known 4-byte match between src[start:] and
+// src[candidate:] and grows it in both directions. The end is extended as far
+// as the data keeps matching; the start is moved back while the preceding
+// bytes match, but never below min, and candidate never goes below 0.
+func extendMatch2(src []byte, start, candidate, min int) absoluteMatch {
+	m := absoluteMatch{
+		End: extendMatch(src, candidate+4, start+4),
+	}
+	for start > min && candidate > 0 {
+		if src[start-1] != src[candidate-1] {
+			break
+		}
+		start--
+		candidate--
+	}
+	m.Start = start
+	m.Match = candidate
+	return m
+}
@@ -0,0 +1,103 @@
+// Package matchfinder defines reusable components for data compression.
+//
+// Many compression libraries have two main parts:
+//   - Something that looks for repeated sequences of bytes
+//   - An encoder for the compressed data format (often an entropy coder)
+//
+// Although these are logically two separate steps, the implementations are
+// usually closely tied together. You can't use flate's matcher with snappy's
+// encoder, for example. This package defines interfaces and an intermediate
+// representation to allow mixing and matching compression components.
+package matchfinder
+
+import "io"
+
+// A Match is the basic unit of LZ77 compression.
+//
+// It describes Unmatched literal bytes followed by a copy of Length bytes
+// taken from Distance bytes earlier in the stream.
+type Match struct {
+	Unmatched int // the number of unmatched bytes since the previous match
+	Length    int // the number of bytes in the matched string; it may be 0 at the end of the input
+	Distance  int // how far back in the stream to copy from
+}
+
+// A MatchFinder performs the LZ77 stage of compression, looking for matches.
+type MatchFinder interface {
+	// FindMatches looks for matches in src, appends them to dst, and returns dst.
+	// The implementations in this package append Matches whose Unmatched and
+	// Length fields together account for every byte of src.
+	FindMatches(dst []Match, src []byte) []Match
+
+	// Reset clears any internal state, preparing the MatchFinder to be used with
+	// a new stream.
+	Reset()
+}
+
+// An Encoder encodes the data in its final format.
+type Encoder interface {
+	// Encode appends the encoded format of src to dst, using the match
+	// information from matches.
+	// Writer passes lastBlock = true only for the block written by Close.
+	Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte
+
+	// Reset clears any internal state, preparing the Encoder to be used with
+	// a new stream.
+	Reset()
+}
+
+// A Writer uses MatchFinder and Encoder to write compressed data to Dest.
+type Writer struct {
+	// Dest receives the encoded output; each block is passed to it in a
+	// single Write call.
+	// MatchFinder produces the matches for each block, and Encoder turns the
+	// block and its matches into the output bytes.
+	Dest        io.Writer
+	MatchFinder MatchFinder
+	Encoder     Encoder
+
+	// BlockSize is the number of bytes to compress at a time. If it is zero,
+	// each Write operation will be treated as one block.
+	BlockSize int
+
+	// err is the error returned by the most recent write to Dest.
+	// inBuf holds input that has not yet filled a whole block (used only
+	// when BlockSize is nonzero).
+	// outBuf and matches are scratch buffers reused from block to block.
+	err     error
+	inBuf   []byte
+	outBuf  []byte
+	matches []Match
+}
+
+func (w *Writer) Write(p []byte) (n int, err error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+
+	if w.BlockSize == 0 {
+		return w.writeBlock(p, false)
+	}
+
+	w.inBuf = append(w.inBuf, p...)
+	var pos int
+	for pos = 0; pos+w.BlockSize <= len(w.inBuf) && w.err == nil; pos += w.BlockSize {
+		w.writeBlock(w.inBuf[pos:pos+w.BlockSize], false)
+	}
+	if pos > 0 {
+		n := copy(w.inBuf, w.inBuf[pos:])
+		w.inBuf = w.inBuf[:n]
+	}
+
+	return len(p), w.err
+}
+
+// writeBlock compresses p as a single block: it finds the matches, encodes
+// them, and writes the encoded bytes to w.Dest in one call. The result of
+// that write is stored in w.err and returned along with len(p).
+// The matches and outBuf scratch buffers are reused from block to block.
+func (w *Writer) writeBlock(p []byte, lastBlock bool) (n int, err error) {
+	w.matches = w.MatchFinder.FindMatches(w.matches[:0], p)
+	w.outBuf = w.Encoder.Encode(w.outBuf[:0], p, w.matches, lastBlock)
+	_, w.err = w.Dest.Write(w.outBuf)
+	return len(p), w.err
+}
+
+// Close compresses whatever input is still buffered as the final block
+// (lastBlock = true), writes it to w.Dest, and empties the input buffer.
+// It returns the error from that write, if any. It does not close w.Dest.
+//
+// If an earlier write already failed, Close returns that error and writes
+// nothing. Without this check the final block would be appended to an
+// already broken stream, and a successful final write would replace the
+// stored error with nil, hiding the failure from the caller.
+func (w *Writer) Close() error {
+	if w.err != nil {
+		return w.err
+	}
+	w.writeBlock(w.inBuf, true)
+	w.inBuf = w.inBuf[:0]
+	return w.err
+}
+
+// Reset prepares w to compress a new stream to newDest. The MatchFinder and
+// Encoder are reset, any stored error is cleared, and the internal buffers
+// are emptied while keeping their capacity.
+func (w *Writer) Reset(newDest io.Writer) {
+	w.MatchFinder.Reset()
+	w.Encoder.Reset()
+
+	w.Dest = newDest
+	w.err = nil
+	w.matches = w.matches[:0]
+	w.outBuf = w.outBuf[:0]
+	w.inBuf = w.inBuf[:0]
+}
@@ -0,0 +1,53 @@
+package matchfinder
+
+import "fmt"
+
+// A TextEncoder is an Encoder whose output is meant to be read by people:
+// unmatched bytes are copied through unchanged, and each match is written as
+// a <Length,Distance> symbol.
+type TextEncoder struct{}
+
+// Reset does nothing; a TextEncoder has no state.
+func (t TextEncoder) Reset() {}
+
+// Encode appends the text form of src to dst and returns the result.
+// For each Match, its Unmatched bytes are copied from src, followed by a
+// "<Length,Distance>" symbol if Length is positive. Any bytes of src not
+// covered by matches are copied at the end. lastBlock is ignored.
+func (t TextEncoder) Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte {
+	offset := 0
+	for i := range matches {
+		m := &matches[i]
+		if n := m.Unmatched; n > 0 {
+			dst = append(dst, src[offset:offset+n]...)
+			offset += n
+		}
+		if m.Length <= 0 {
+			continue
+		}
+		dst = append(dst, fmt.Sprintf("<%d,%d>", m.Length, m.Distance)...)
+		offset += m.Length
+	}
+	if offset >= len(src) {
+		return dst
+	}
+	return append(dst, src[offset:]...)
+}
+
+// A NoMatchFinder is a MatchFinder that never finds a match.
+// It can be used to implement the equivalent of the standard library flate
+// package's HuffmanOnly setting.
+type NoMatchFinder struct{}
+
+// Reset does nothing; a NoMatchFinder has no state.
+func (n NoMatchFinder) Reset() {}
+
+// FindMatches appends a single Match that marks all of src as unmatched,
+// and returns the extended slice.
+func (n NoMatchFinder) FindMatches(dst []Match, src []byte) []Match {
+	literal := Match{Unmatched: len(src)}
+	return append(dst, literal)
+}
+
+// AutoReset wraps a MatchFinder and resets it before every block, so that a
+// MatchFinder which can normally refer back to data in previous blocks never
+// does so. It is useful for (e.g.) using a snappy Encoder with a MatchFinder
+// designed for flate. (Snappy doesn't support references between blocks.)
+type AutoReset struct {
+	MatchFinder
+}
+
+// FindMatches resets the wrapped MatchFinder and then delegates to it.
+func (a AutoReset) FindMatches(dst []Match, src []byte) []Match {
+	inner := a.MatchFinder
+	inner.Reset()
+	return inner.FindMatches(dst, src)
+}