From 740d57a71dfc4d6a552cf75e29ac5cad847129af Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 23 Apr 2021 13:05:08 +0200 Subject: [PATCH] zstd: Improve better compression Try to find a better match by searching for a long match at the end of the current best match. Before/after pairs. Speed comparison is not reliable, since different Go versions were used. ``` silesia.tar zskp 3 211947520 65177448 1899 106.44 silesia.tar zskp 3 211947520 64595893 2007 100.68 gob-stream zskp 3 1911399616 185792019 9324 195.48 gob-stream zskp 3 1911399616 175034659 9636 189.17 enwik9 zskp 3 1000000000 294540704 11725 81.34 enwik9 zskp 3 1000000000 292243069 12162 78.41 github-june-2days-2019.json zskp 3 6273951764 537511906 29252 204.54 github-june-2days-2019.json zskp 3 6273951764 524340691 34043 175.75 rawstudio-mint14.tar zskp 3 8558382592 3224594213 71751 113.75 rawstudio-mint14.tar zskp 3 8558382592 3158085214 77675 105.08 nyc-taxi-data-10M.csv zskp 3 3325605752 538490114 25683 123.49 nyc-taxi-data-10M.csv zskp 3 3325605752 530289687 25239 125.66 ``` --- zstd/enc_better.go | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/zstd/enc_better.go b/zstd/enc_better.go index c2ce4a2bac..ca8f1f2467 100644 --- a/zstd/enc_better.go +++ b/zstd/enc_better.go @@ -412,8 +412,41 @@ encodeLoop: cv = load6432(src, s) } - // A 4-byte match has been found. Update recent offsets. - // We'll later see if more than 4 bytes. + // Try to find a better match by searching for a long match at the end of the current best match + if true && s+matched < sLimit { + nextHashL := hash8(load6432(src, s+matched), betterLongTableBits) + cv := load3232(src, s) + candidateL := e.longTable[nextHashL] + coffsetL := candidateL.offset - e.cur - matched + if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + // Found a long match, at least 4 bytes. 
+ matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + if matchedNext > matched { + t = coffsetL + matched = matchedNext + if debugMatches { + println("long match at end-of-match") + } + } + } + + // Check prev long... + if true { + coffsetL = candidateL.prev - e.cur - matched + if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + // Found a long match, at least 4 bytes. + matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + if matchedNext > matched { + t = coffsetL + matched = matchedNext + if debugMatches { + println("prev long match at end-of-match") + } + } + } + } + } + // A match has been found. Update recent offsets. offset2 = offset1 offset1 = s - t