This commit is contained in:
GitLab Deploy Bot
2025-10-21 23:45:13 +07:00
parent 6c387b420c
commit bb60e987e5
3548 changed files with 4952576 additions and 116 deletions
+45
View File
@@ -0,0 +1,45 @@
package matchfinder
// An absoluteMatch is like a Match, but it stores indexes into the byte
// stream instead of lengths. matchEmitter.emit converts it into a Match
// (Length = End - Start, Distance = Start - Match).
type absoluteMatch struct {
// Start is the index of the first byte.
Start int
// End is the index of the byte after the last byte
// (so that End - Start = Length).
End int
// Match is the index of the previous data that matches
// (Start - Match = Distance).
Match int
}
// A matchEmitter collects the matches produced by a MatchFinder and keeps
// track of how much of the input has already been accounted for.
type matchEmitter struct {
	// Dst is the slice that emitted Matches are appended to.
	Dst []Match

	// NextEmit is the index of the first byte that has not been covered by
	// an emitted Match yet.
	NextEmit int
}

// emit converts m to a Match (the bytes between NextEmit and m.Start become
// its Unmatched count), appends it to Dst, and advances NextEmit to m.End.
func (e *matchEmitter) emit(m absoluteMatch) {
	literals := m.Start - e.NextEmit
	converted := Match{
		Unmatched: literals,
		Length:    m.End - m.Start,
		Distance:  m.Start - m.Match,
	}
	e.Dst = append(e.Dst, converted)
	e.NextEmit = m.End
}

// trim clips m so that it does not extend past maxEnd and emits the result,
// unless clipping left it shorter than minLength, in which case nothing is
// emitted.
func (e *matchEmitter) trim(m absoluteMatch, maxEnd int, minLength int) {
	end := m.End
	if maxEnd < end {
		end = maxEnd
	}
	if end-m.Start < minLength {
		return
	}
	m.End = end
	e.emit(m)
}
+169
View File
@@ -0,0 +1,169 @@
package matchfinder
import (
"encoding/binary"
)
// M0 is an implementation of the MatchFinder interface based
// on the algorithm used by snappy, but modified to be more like the algorithm
// used by compression level 0 of the brotli reference implementation.
//
// It has a maximum block size of 65536 bytes.
type M0 struct {
// Lazy turns on "lazy matching," for higher compression but less speed.
Lazy bool
// MaxDistance, if nonzero, is the largest distance FindMatches accepts
// between a position and its match candidate; candidates farther back
// are skipped. Zero means no limit.
MaxDistance int
// MaxLength, if nonzero, is the longest match FindMatches reports;
// longer matches are cut down to this length. Zero means no limit.
MaxLength int
}
// Reset does nothing: FindMatches builds its hash table afresh on every
// call, so M0 carries no state from one block to the next.
func (M0) Reset() {}
const (
// m0HashLen is the number of low-order input bytes that M0.hash uses.
m0HashLen = 5
// m0TableBits is the width, in bits, of the value returned by M0.hash,
// and m0TableSize is the matching number of hash table entries.
m0TableBits = 14
m0TableSize = 1 << m0TableBits
// NOTE(review): m0Shift does not appear to be referenced by M0.hash or
// M0.FindMatches; confirm whether it is still needed.
m0Shift = 32 - m0TableBits
// m0TableMask is redundant, but helps the compiler eliminate bounds
// checks.
m0TableMask = m0TableSize - 1
)
// hash maps the m0HashLen low-order bytes of data to an m0TableBits-bit value.
// The remaining high-order bytes of data do not affect the result.
func (m M0) hash(data uint64) uint64 {
	const (
		discardBits = 64 - 8*m0HashLen // shifts the unused high bytes out
		resultShift = 64 - m0TableBits // keeps the top m0TableBits bits
	)
	mixed := (data << discardBits) * hashMul64
	return mixed >> resultShift
}
// FindMatches looks for matches in src, appends them to dst, and returns dst.
// src must not be longer than 65536 bytes; FindMatches panics if it is.
//
// Inputs shorter than minNonLiteralBlockSize (17 bytes) are not searched and
// are returned as a single Match that has only Unmatched set. The hash table
// is local to each call, so matches never refer to data from a previous call.
func (m M0) FindMatches(dst []Match, src []byte) []Match {
const inputMargin = 16 - 1
const minNonLiteralBlockSize = 1 + 1 + inputMargin
if len(src) < minNonLiteralBlockSize {
dst = append(dst, Match{
Unmatched: len(src),
})
return dst
}
if len(src) > 65536 {
panic("block too long")
}
// table maps a hash value to the most recent position in src that had
// that hash. Positions fit in a uint16 because of the 65536-byte limit
// checked above.
var table [m0TableSize]uint16
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
nextHash := m.hash(binary.LittleEndian.Uint64(src[s:]))
for {
// Copied from the C++ snappy implementation:
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
// scanned (or skipped), look at every third byte, etc. When a match
// is found, immediately go back to looking at every byte. This is a
// small loss (~5% performance, ~0.1% density) for compressible data
// due to more bookkeeping, but for non-compressible data (such as
// JPEG) it's a huge win since the compressor quickly "realizes" the
// data is incompressible and doesn't bother looking for matches
// everywhere.
//
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
// the number of bytes to move ahead for each iteration.
skip := 32
nextS := s
candidate := 0
for {
s = nextS
bytesBetweenHashLookups := skip >> 5
nextS = s + bytesBetweenHashLookups
skip += bytesBetweenHashLookups
if nextS > sLimit {
goto emitRemainder
}
// Look up the previous position with this hash, record s in its
// place, and precompute the hash for the next position.
candidate = int(table[nextHash&m0TableMask])
table[nextHash&m0TableMask] = uint16(s)
nextHash = m.hash(binary.LittleEndian.Uint64(src[nextS:]))
// Candidates farther back than MaxDistance (when set) are ignored.
if m.MaxDistance != 0 && s-candidate > m.MaxDistance {
continue
}
// The hash is only a hint; confirm that the first 4 bytes really match.
if binary.LittleEndian.Uint32(src[s:]) == binary.LittleEndian.Uint32(src[candidate:]) {
break
}
}
// Invariant: we have a 4-byte match at s.
base := s
s = extendMatch(src, candidate+4, s+4)
// origBase remembers where the first match started; the lazy branch
// below may move base forward by one.
origBase := base
if m.Lazy && base+1 < sLimit {
// Lazy matching: also try a match starting one byte later, and
// switch to it only if it is more than one byte longer.
newBase := base + 1
h := m.hash(binary.LittleEndian.Uint64(src[newBase:]))
newCandidate := int(table[h&m0TableMask])
table[h&m0TableMask] = uint16(newBase)
okDistance := true
if m.MaxDistance != 0 && newBase-newCandidate > m.MaxDistance {
okDistance = false
}
if okDistance && binary.LittleEndian.Uint32(src[newBase:]) == binary.LittleEndian.Uint32(src[newCandidate:]) {
newS := extendMatch(src, newCandidate+4, newBase+4)
if newS-newBase > s-base+1 {
s = newS
base = newBase
candidate = newCandidate
}
}
}
// Cut the match down to MaxLength when a limit is set.
if m.MaxLength != 0 && s-base > m.MaxLength {
s = base + m.MaxLength
}
dst = append(dst, Match{
Unmatched: base - nextEmit,
Length: s - base,
Distance: base - candidate,
})
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
if m.Lazy {
// If lazy matching is enabled, we update the hash table for
// every byte in the match.
for i := origBase + 2; i < s-1; i++ {
x := binary.LittleEndian.Uint64(src[i:])
table[m.hash(x)&m0TableMask] = uint16(i)
}
}
// We could immediately start working at s now, but to improve
// compression we first update the hash table at s-1 and at s.
// x holds the 8 bytes starting at s-1; shifting it right by 8 bits
// drops the byte at s-1, leaving the bytes that start at s in the low
// positions that hash looks at.
x := binary.LittleEndian.Uint64(src[s-1:])
prevHash := m.hash(x >> 0)
table[prevHash&m0TableMask] = uint16(s - 1)
nextHash = m.hash(x >> 8)
}
emitRemainder:
// Whatever follows the last match is reported as trailing literals.
if nextEmit < len(src) {
dst = append(dst, Match{
Unmatched: len(src) - nextEmit,
})
}
return dst
}
+297
View File
@@ -0,0 +1,297 @@
package matchfinder
import (
"encoding/binary"
"math/bits"
"runtime"
)
// M4 is an implementation of the MatchFinder
// interface that uses a hash table to find matches,
// optional match chains,
// and the advanced parsing technique from
// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html.
//
// Fields left at zero are replaced by their defaults the first time
// FindMatches is called.
type M4 struct {
// MaxDistance is the maximum distance (in bytes) to look back for
// a match. The default is 65535.
MaxDistance int
// MinLength is the length of the shortest match to return.
// The default is 4.
MinLength int
// HashLen is the number of bytes to use to calculate the hashes.
// The maximum is 8 and the default is 6.
HashLen int
// TableBits is the number of bits in the hash table indexes.
// The default is 17 (128K entries).
TableBits int
// ChainLength is how many entries to search on the "match chain" of older
// locations with the same hash as the current location.
// The default is 0, which disables the chain.
ChainLength int
// DistanceBitCost is used when comparing two matches to see
// which is better. The comparison is primarily based on the length
// of the matches, but it can also take the distance into account,
// in terms of the number of bits needed to represent the distance.
// One byte of length is given a score of 256, so 32 (256/8) would
// be a reasonable first guess for the value of one bit.
// (The default is 0, which bases the comparison solely on length.)
DistanceBitCost int
// table maps a hash value to the most recent position in history with
// that hash. A value of 0 is treated as "no entry".
table []uint32
// chain is indexed by position in history (it is only maintained when
// ChainLength > 0). chain[i] is the distance from i back to the previous
// position that had the same hash, or 0 if none was recorded.
chain []uint16
// history holds the input from previous calls to FindMatches followed by
// the current block; all match positions are indexes into it.
history []byte
}
// Reset discards the history and the match chain and zeroes the hash table,
// so that the next call to FindMatches starts a new stream. The underlying
// buffers are kept for reuse.
func (q *M4) Reset() {
	q.chain = q.chain[:0]
	q.history = q.history[:0]

	tbl := q.table
	for i := 0; i < len(tbl); i++ {
		tbl[i] = 0
	}
}
// score rates m so that two matches can be compared; higher is better.
// Each byte of length is worth 256 points, and every leading zero bit of the
// 32-bit distance (i.e. every bit a shorter distance saves) is worth
// DistanceBitCost points.
func (q *M4) score(m absoluteMatch) int {
	length := m.End - m.Start
	distance := uint32(m.Start - m.Match)
	return length*256 + bits.LeadingZeros32(distance)*q.DistanceBitCost
}
// FindMatches looks for matches in src, appends them to dst, and returns dst.
//
// Configuration fields that are still zero are set to their defaults. src is
// appended to an internal history buffer, so matches may refer back to data
// passed in earlier calls (up to MaxDistance bytes back); call Reset before
// starting a new stream.
func (q *M4) FindMatches(dst []Match, src []byte) []Match {
	if q.MaxDistance == 0 {
		q.MaxDistance = 65535
	}
	if q.MinLength == 0 {
		q.MinLength = 4
	}
	if q.HashLen == 0 {
		q.HashLen = 6
	}
	if q.TableBits == 0 {
		q.TableBits = 17
	}
	if len(q.table) < 1<<q.TableBits {
		q.table = make([]uint32, 1<<q.TableBits)
	}

	e := matchEmitter{Dst: dst}

	if len(q.history) > q.MaxDistance*2 {
		// Trim down the history buffer: keep only the last MaxDistance
		// bytes, which moves every retained position down by delta.
		delta := len(q.history) - q.MaxDistance
		copy(q.history, q.history[delta:])
		q.history = q.history[:q.MaxDistance]

		if q.ChainLength > 0 {
			// chain is indexed by history position, so its entries have to
			// move down together with the bytes they describe. (The stored
			// values are distances and therefore stay the same.) Truncating
			// without shifting would leave each retained position with the
			// chain link of an unrelated, discarded position.
			if delta <= len(q.chain) {
				copy(q.chain, q.chain[delta:])
			}
			q.chain = q.chain[:q.MaxDistance]
		}

		// Table entries are positions too; entries that pointed into the
		// discarded part become 0, which means "no entry".
		for i, v := range q.table {
			newV := int(v) - delta
			if newV < 0 {
				newV = 0
			}
			q.table[i] = uint32(newV)
		}
	}

	// Append src to the history buffer.
	e.NextEmit = len(q.history)
	q.history = append(q.history, src...)
	if q.ChainLength > 0 {
		q.chain = append(q.chain, make([]uint16, len(src))...)
	}
	src = q.history

	// matches stores the matches that have been found but not emitted,
	// in reverse order. (matches[0] is the most recent one.)
	var matches [3]absoluteMatch
	// The loop stops 7 bytes early so that an 8-byte load at i is always
	// in bounds.
	for i := e.NextEmit; i < len(src)-7; i++ {
		if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
			// We have found some matches, and we're far enough along that we probably
			// won't find overlapping matches, so we might as well emit them.
			if matches[1] != (absoluteMatch{}) {
				e.trim(matches[1], matches[0].Start, q.MinLength)
			}
			e.emit(matches[0])
			matches = [3]absoluteMatch{}
		}

		// Calculate and store the hash.
		h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
		candidate := int(q.table[h])
		q.table[h] = uint32(i)
		if q.ChainLength > 0 && candidate != 0 {
			// Record the link to the previous position with this hash, if
			// the distance fits in a uint16.
			delta := i - candidate
			if delta < 1<<16 {
				q.chain[i] = uint16(delta)
			}
		}

		// Inside the current match, only one position near its end is
		// searched; everywhere else we just keep the hash table up to date.
		if i < matches[0].End && i != matches[0].End+2-q.HashLen {
			continue
		}
		if candidate == 0 || i-candidate > q.MaxDistance {
			continue
		}

		// Look for a match. Candidates at the same distance as the current
		// match are skipped.
		// NOTE(review): the length comparisons below are strict, so a newly
		// found match of exactly MinLength bytes is not used; confirm that
		// this is intended.
		var currentMatch absoluteMatch

		if i-candidate != matches[0].Start-matches[0].Match {
			if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
				m := extendMatch2(src, i, candidate, e.NextEmit)
				if m.End-m.Start > q.MinLength {
					currentMatch = m
				}
			}
		}

		// Follow the chain to older positions with the same hash, keeping
		// whichever match scores best.
		for j := 0; j < q.ChainLength; j++ {
			delta := q.chain[candidate]
			if delta == 0 {
				break
			}
			candidate -= int(delta)
			if candidate <= 0 || i-candidate > q.MaxDistance {
				break
			}
			if i-candidate != matches[0].Start-matches[0].Match {
				if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
					m := extendMatch2(src, i, candidate, e.NextEmit)
					if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
						currentMatch = m
					}
				}
			}
		}

		if currentMatch.End-currentMatch.Start < q.MinLength {
			continue
		}

		overlapPenalty := 0
		if matches[0] != (absoluteMatch{}) {
			overlapPenalty = 275
			if currentMatch.Start <= matches[1].End {
				// This match would completely replace the previous match,
				// so there is no penalty for overlap.
				overlapPenalty = 0
			}
		}

		if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty {
			continue
		}

		matches = [3]absoluteMatch{
			currentMatch,
			matches[0],
			matches[1],
		}

		if matches[2] == (absoluteMatch{}) {
			continue
		}

		// We have three matches, so it's time to emit one and/or eliminate one.
		switch {
		case matches[0].Start < matches[2].End:
			// The first and third matches overlap; discard the one in between.
			matches = [3]absoluteMatch{
				matches[0],
				matches[2],
				absoluteMatch{},
			}

		case matches[0].Start < matches[2].End+q.MinLength:
			// The first and third matches don't overlap, but there's no room for
			// another match between them. Emit the first match and discard the second.
			e.emit(matches[2])
			matches = [3]absoluteMatch{
				matches[0],
				absoluteMatch{},
				absoluteMatch{},
			}

		default:
			// Emit the first match, shortening it if necessary to avoid overlap with the second.
			e.trim(matches[2], matches[1].Start, q.MinLength)
			matches[2] = absoluteMatch{}
		}
	}

	// We've found all the matches now; emit the remaining ones.
	if matches[1] != (absoluteMatch{}) {
		e.trim(matches[1], matches[0].Start, q.MinLength)
	}
	if matches[0] != (absoluteMatch{}) {
		e.emit(matches[0])
	}

	dst = e.Dst
	if e.NextEmit < len(src) {
		dst = append(dst, Match{
			Unmatched: len(src) - e.NextEmit,
		})
	}

	return dst
}
// hashMul64 is the 64-bit multiplier used by M0.hash and M4.FindMatches to
// mix input bytes into a hash value; the high bits of the product are used
// as the table index.
const hashMul64 = 0x1E35A7BD1E35A7BD
// extendMatch returns the largest k such that k <= len(src) and that
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
//
//	0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
	n := len(src)

	if runtime.GOARCH == "amd64" {
		// While more than 8 bytes remain after j, compare 8 bytes per step.
		for ; j+8 < n; i, j = i+8, j+8 {
			diff := binary.LittleEndian.Uint64(src[i:]) ^ binary.LittleEndian.Uint64(src[j:])
			if diff != 0 {
				// The values were loaded little-endian, so the lowest set
				// bit of the XOR lies in the first byte that differs;
				// shifting by 3 turns the bit index into a byte index.
				return j + bits.TrailingZeros64(diff)>>3
			}
		}
	} else if runtime.GOARCH == "386" {
		// On a 32-bit CPU, do the same thing 4 bytes at a time.
		for ; j+4 < n; i, j = i+4, j+4 {
			diff := binary.LittleEndian.Uint32(src[i:]) ^ binary.LittleEndian.Uint32(src[j:])
			if diff != 0 {
				return j + bits.TrailingZeros32(diff)>>3
			}
		}
	}

	// Finish (or, on other architectures, do everything) one byte at a time.
	for j < n && src[i] == src[j] {
		i++
		j++
	}
	return j
}
// extendMatch2 takes a 4-byte match between src[start:] and src[candidate:]
// and grows it in both directions: forward as far as the bytes keep matching,
// and backward while the preceding bytes match, but never moving start below
// min or candidate below 0. It returns the extended match.
func extendMatch2(src []byte, start, candidate, min int) absoluteMatch {
	// The first 4 bytes are already known to match, so continue after them.
	end := extendMatch(src, candidate+4, start+4)

	for {
		if start <= min || candidate <= 0 {
			break
		}
		if src[start-1] != src[candidate-1] {
			break
		}
		start, candidate = start-1, candidate-1
	}

	var m absoluteMatch
	m.Start = start
	m.End = end
	m.Match = candidate
	return m
}
+103
View File
@@ -0,0 +1,103 @@
// Package matchfinder defines reusable components for data compression.
//
// Many compression libraries have two main parts:
// - Something that looks for repeated sequences of bytes
// - An encoder for the compressed data format (often an entropy coder)
//
// Although these are logically two separate steps, the implementations are
// usually closely tied together. You can't use flate's matcher with snappy's
// encoder, for example. This package defines interfaces and an intermediate
// representation to allow mixing and matching compression components.
package matchfinder
import "io"
// A Match is the basic unit of LZ77 compression: a run of Unmatched literal
// bytes followed by a copy of Length bytes taken from Distance bytes back.
type Match struct {
Unmatched int // the number of unmatched (literal) bytes since the previous match
Length int // the number of bytes in the matched string; it may be 0 at the end of the input
Distance int // how far back in the stream to copy from
}
// A MatchFinder performs the LZ77 stage of compression, looking for matches.
type MatchFinder interface {
// FindMatches looks for matches in src, appends them to dst, and returns dst.
// Passing a dst with spare capacity lets the caller reuse its storage
// from one block to the next.
FindMatches(dst []Match, src []byte) []Match
// Reset clears any internal state, preparing the MatchFinder to be used with
// a new stream.
Reset()
}
// An Encoder encodes the data in its final format.
type Encoder interface {
// Encode appends the encoded format of src to dst, using the match
// information from matches, and returns the extended slice.
// lastBlock is true when src is the final block of the stream
// (Writer passes true only from Close).
Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte
// Reset clears any internal state, preparing the Encoder to be used with
// a new stream.
Reset()
}
// A Writer uses MatchFinder and Encoder to write compressed data to Dest.
//
// Once a write to Dest fails, the error is remembered: subsequent calls to
// Write and Close return it without writing anything more, until Reset is
// called.
type Writer struct {
	Dest        io.Writer
	MatchFinder MatchFinder
	Encoder     Encoder

	// BlockSize is the number of bytes to compress at a time. If it is zero,
	// each Write operation will be treated as one block.
	BlockSize int

	err     error   // first error returned by Dest.Write, if any
	inBuf   []byte  // input not yet compressed (only used when BlockSize != 0)
	outBuf  []byte  // scratch buffer for the encoded form of one block
	matches []Match // scratch buffer for the matches of one block
}

// Write compresses p and writes the result to Dest. If BlockSize is zero, p
// is compressed as a single block right away; otherwise p is buffered and
// only complete blocks of BlockSize bytes are compressed, with any remainder
// kept for the next Write or Close.
func (w *Writer) Write(p []byte) (n int, err error) {
	if w.err != nil {
		return 0, w.err
	}
	if w.BlockSize == 0 {
		return w.writeBlock(p, false)
	}

	w.inBuf = append(w.inBuf, p...)
	var pos int
	for pos = 0; pos+w.BlockSize <= len(w.inBuf) && w.err == nil; pos += w.BlockSize {
		// writeBlock records any failure in w.err, which ends the loop.
		w.writeBlock(w.inBuf[pos:pos+w.BlockSize], false)
	}
	if pos > 0 {
		// Move the unprocessed tail to the front of the buffer.
		remaining := copy(w.inBuf, w.inBuf[pos:])
		w.inBuf = w.inBuf[:remaining]
	}

	return len(p), w.err
}

// writeBlock compresses p as one block and writes it to Dest. The result of
// the write is stored in w.err and also returned.
func (w *Writer) writeBlock(p []byte, lastBlock bool) (n int, err error) {
	w.outBuf = w.outBuf[:0]
	w.matches = w.MatchFinder.FindMatches(w.matches[:0], p)
	w.outBuf = w.Encoder.Encode(w.outBuf, p, w.matches, lastBlock)
	_, w.err = w.Dest.Write(w.outBuf)
	return len(p), w.err
}

// Close compresses any buffered input as the last block of the stream and
// writes it to Dest. It does not close Dest.
//
// If an earlier write failed, Close returns that error and writes nothing:
// the stream is already incomplete, and writing the final block would
// replace the recorded error with the result of the new write.
func (w *Writer) Close() error {
	if w.err != nil {
		return w.err
	}
	w.writeBlock(w.inBuf, true)
	w.inBuf = w.inBuf[:0]
	return w.err
}

// Reset discards all buffered data and any recorded error, resets the
// MatchFinder and the Encoder, and makes w write to newDest.
func (w *Writer) Reset(newDest io.Writer) {
	w.MatchFinder.Reset()
	w.Encoder.Reset()
	w.err = nil
	w.inBuf = w.inBuf[:0]
	w.outBuf = w.outBuf[:0]
	w.matches = w.matches[:0]
	w.Dest = newDest
}
+53
View File
@@ -0,0 +1,53 @@
package matchfinder
import "fmt"
// A TextEncoder is an Encoder that renders the result of LZ77 compression as
// readable text: literal bytes are copied through unchanged and every match
// is written as a <Length,Distance> symbol.
type TextEncoder struct{}

// Reset does nothing; a TextEncoder has no state.
func (t TextEncoder) Reset() {}

// Encode appends the text form of src to dst and returns the extended slice.
// Bytes of src that are not covered by matches are appended as literals at
// the end. lastBlock is ignored.
func (t TextEncoder) Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte {
	consumed := 0
	for i := range matches {
		m := &matches[i]
		if n := m.Unmatched; n > 0 {
			dst = append(dst, src[consumed:consumed+n]...)
			consumed += n
		}
		if m.Length <= 0 {
			continue
		}
		dst = append(dst, fmt.Sprintf("<%d,%d>", m.Length, m.Distance)...)
		consumed += m.Length
	}

	if consumed >= len(src) {
		return dst
	}
	return append(dst, src[consumed:]...)
}
// A NoMatchFinder implements MatchFinder without ever finding a match: each
// block is reported as a single run of literals. It can be used to implement
// the equivalent of the standard library flate package's HuffmanOnly setting.
type NoMatchFinder struct{}

// Reset does nothing; a NoMatchFinder has no state.
func (n NoMatchFinder) Reset() {}

// FindMatches appends one Match covering all of src as unmatched bytes to
// dst and returns the extended slice.
func (n NoMatchFinder) FindMatches(dst []Match, src []byte) []Match {
	literals := Match{Unmatched: len(src)}
	return append(dst, literals)
}
// AutoReset wraps a MatchFinder that can return references to data in previous
// blocks, and calls Reset before each block, so that every block is matched
// on its own. It is useful for (e.g.) using a snappy Encoder with a
// MatchFinder designed for flate. (Snappy doesn't support references between
// blocks.)
type AutoReset struct {
	MatchFinder
}

// FindMatches resets the wrapped MatchFinder and then has it find the matches
// in src, appending them to dst.
func (a AutoReset) FindMatches(dst []Match, src []byte) []Match {
	inner := a.MatchFinder
	inner.Reset()
	return inner.FindMatches(dst, src)
}