From 8b662725461b0ed6501f40340040bf69a564669b Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 5 Feb 2021 12:33:26 +0100 Subject: [PATCH 01/10] s2: Add AMD64 assembly for better mode --- s2/_generate/gen.go | 790 +++++++++- s2/encode_amd64.go | 32 + s2/encode_better.go | 2 +- s2/encode_go.go | 10 + s2/encodeblock_amd64.go | 28 + s2/encodeblock_amd64.s | 3255 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 4032 insertions(+), 85 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index f1c345a027..af096dca6a 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -46,6 +46,11 @@ func main() { o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B) o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) + o.snappy = false o.maxLen = math.MaxUint32 o.genEmitLiteral() @@ -722,103 +727,720 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m RET() } -// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. -// Checks if base == nextemit. -// src & base are untouched. 
-func (o options) emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string) { - nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64() - MOVL(nextEmitL, nextEmit) - CMPL(nextEmit, base.As32()) - JEQ(LabelRef("emit_literal_skip_" + name)) - MOVL(base.As32(), litLen.As32()) +func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) { + TEXT(name, 0, "func(dst, src []byte) int") + Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", + fmt.Sprintf("Maximum input %d bytes.", maxLen), + "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") + Pragma("noescape") - // Base is now next emit. - MOVL(base.As32(), nextEmitL) + if lHashBytes > 7 || lHashBytes <= 4 { + panic("lHashBytes must be <= 7 and >4") + } + const literalMaxOverhead = 4 + var sTableBits = lTableBits - 2 + const sHashBytes = 4 + o.maxLen = maxLen - // litBase = src[nextEmitL:] - LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) - SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit + var lTableSize = 4 * (1 << lTableBits) + var sTableSize = 4 * (1 << sTableBits) - // Load (and store when we return) - MOVQ(dstBase, dstBaseTmp) - o.emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), true) - Label("emit_literal_done_" + name) + // Memzero needs at least 128 bytes. + if (lTableSize + sTableSize) < 128 { + panic("tableSize must be at least 128 bytes") + } - // Emitted length must be > litlen. - // We have already checked for len(0) above. 
- assert(func(ok LabelRef) { - tmp := GP64() - MOVQ(dstBaseTmp, tmp) - SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase - // if tmp > litLen: ok - CMPQ(tmp, litLen.As64()) - JG(ok) - }) - // Store updated dstBase - MOVQ(dstBaseTmp, dstBase) - Label("emit_literal_skip_" + name) -} + lenSrcBasic, err := Param("src").Len().Resolve() + if err != nil { + panic(err) + } + lenSrcQ := lenSrcBasic.Addr -// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. -// Checks if base == nextemit. -// src & base are untouched. -func (o options) emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string) { - nextEmit, litLen, litBase := GP32(), GP32(), GP64() - MOVL(nextEmitL, nextEmit) - CMPL(nextEmit, base.As32()) - JEQ(LabelRef("emit_literal_done_" + name)) - MOVL(base.As32(), litLen.As32()) + lenDstBasic, err := Param("dst").Len().Resolve() + if err != nil { + panic(err) + } + lenDstQ := lenDstBasic.Addr - // Base is now next emit. - MOVL(base.As32(), nextEmitL) + // Bail if we can't compress to at least this. + dstLimitPtrQ := AllocLocal(8) - // litBase = src[nextEmitL:] - LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) - SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit + // sLimitL is when to stop looking for offset/length copies. + sLimitL := AllocLocal(4) - // Load (and store when we return) - o.emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), true) - Label("emit_literal_done_" + name) -} + // nextEmitL keeps track of the point we have emitted to. + nextEmitL := AllocLocal(4) -type hashGen struct { - bytes int - tablebits int - mulreg reg.GPVirtual -} + // Repeat stores the last match offset. + repeatL := AllocLocal(4) -// hashN uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value. 
-func hashN(hashBytes, tablebits int) hashGen { - h := hashGen{ - bytes: hashBytes, - tablebits: tablebits, - mulreg: GP64(), + // nextSTempL keeps nextS while other functions are being called. + nextSTempL := AllocLocal(4) + + // Alloc table last, lTab must be before sTab. + lTab := AllocLocal(lTableSize) + sTab := AllocLocal(sTableSize) + + dst := GP64() + { + dstBaseBasic, err := Param("dst").Base().Resolve() + if err != nil { + panic(err) + } + dstBaseQ := dstBaseBasic.Addr + MOVQ(dstBaseQ, dst) } - primebytes := uint64(0) - switch hashBytes { - case 3: - primebytes = 506832829 - case 4: - primebytes = 2654435761 - case 5: - primebytes = 889523592379 - case 6: - primebytes = 227718039650203 - case 7: - primebytes = 58295818150454627 - case 8: - primebytes = 0xcf1bbcdcb7a56463 - default: - panic("invalid hash length") + + srcBaseBasic, err := Param("src").Base().Resolve() + if err != nil { + panic(err) } - MOVQ(Imm(primebytes), h.mulreg) - return h -} + srcBaseQ := srcBaseBasic.Addr -// hash uses multiply to get hash of the value. -func (h hashGen) hash(val reg.GPVirtual) { - // Move value to top of register. - SHLQ(U8(64-8*h.bytes), val) + // Zero table + { + iReg := GP64() + MOVQ(U32((sTableSize+lTableSize)/8/16), iReg) + tablePtr := GP64() + LEAQ(lTab, tablePtr) + zeroXmm := XMM() + PXOR(zeroXmm, zeroXmm) + + Label("zero_loop_" + name) + for i := 0; i < 8; i++ { + MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16}) + } + ADDQ(U8(16*8), tablePtr) + DECQ(iReg) + JNZ(LabelRef("zero_loop_" + name)) + } + + { + // nextEmit is offset in src where the next emitLiteral should start from.
+ MOVL(U32(0), nextEmitL) + + const inputMargin = 8 + tmp, tmp2, tmp3 := GP64(), GP64(), GP64() + MOVQ(lenSrcQ, tmp) + LEAQ(Mem{Base: tmp, Disp: -5}, tmp2) + // sLimitL := len(src) - inputMargin + LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) + + assert(func(ok LabelRef) { + CMPQ(tmp3, lenSrcQ) + JL(ok) + }) + + MOVL(tmp3.As32(), sLimitL) + + // dstLimit := (len(src) - 5 ) - len(src)>>5 + SHRQ(U8(5), tmp) + SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp + + assert(func(ok LabelRef) { + // if len(src) > len(src) - len(src)>>5 - 5: ok + CMPQ(lenSrcQ, tmp2) + JGE(ok) + }) + + LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2) + MOVQ(tmp2, dstLimitPtrQ) + } + + // s = 1 + s := GP32() + MOVL(U32(1), s) + // repeatL = 1 + MOVL(s, repeatL) + + src := GP64() + Load(Param("src").Base(), src) + + // Load cv + Label("search_loop_" + name) + candidate := GP32() + { + assert(func(ok LabelRef) { + // Check if somebody changed src + tmp := GP64() + MOVQ(srcBaseQ, tmp) + CMPQ(tmp, src) + JEQ(ok) + }) + + cv := GP64() + MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) + nextS := GP32() + // nextS := s + (s-nextEmit)>>skipLog + 1 + { + tmp := GP64() + MOVL(s, tmp.As32()) // tmp = s + SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit + SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog + LEAL(Mem{Base: s, Disp: 1, Index: tmp, Scale: 1}, nextS) + } + // if nextS > sLimit {goto emitRemainder} + { + CMPL(nextS.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + assert(func(ok LabelRef) { + // Check if s is valid (we should have jumped above if not) + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPQ(tmp, s.As64()) + JG(ok) + }) + // move nextS to stack. 
+ MOVL(nextS.As32(), nextSTempL) + + candidateS := GP32() + lHasher := hashN(lHashBytes, lTableBits) + { + sHasher := hashN(sHashBytes, sTableBits) + hash0, hash1 := GP64(), GP64() + MOVQ(cv, hash0) + MOVQ(cv, hash1) + lHasher.hash(hash0) + sHasher.hash(hash1) + MOVL(lTab.Idx(hash0, 4), candidate) + MOVL(sTab.Idx(hash1, 4), candidateS) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + + MOVL(s, lTab.Idx(hash0, 4)) + MOVL(s, sTab.Idx(hash1, 4)) + } + + // En/disable repeat matching. + if true { + // Check repeat at offset checkRep + const checkRep = 1 + { + // rep = s - repeat + rep := GP32() + MOVL(s, rep) + SUBL(repeatL, rep) // rep = s - repeat + + // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + left, right := GP64(), GP64() + MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32()) + MOVQ(cv, left) + SHRQ(U8(checkRep*8), left) + CMPL(left.As32(), right.As32()) + // BAIL, no repeat. + JNE(LabelRef("no_repeat_found_" + name)) + } + // base = s + checkRep + base := GP32() + LEAL(Mem{Base: s, Disp: checkRep}, base) + + // nextEmit before repeat. + nextEmit := GP32() + MOVL(nextEmitL, nextEmit) + + // Extend back + if true { + i := GP32() + MOVL(base, i) + SUBL(repeatL, i) + JZ(LabelRef("repeat_extend_back_end_" + name)) + + Label("repeat_extend_back_loop_" + name) + // if base <= nextemit {exit} + CMPL(base.As32(), nextEmit) + JLE(LabelRef("repeat_extend_back_end_" + name)) + // if src[i-1] == src[base-1] + tmp, tmp2 := GP64(), GP64() + MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8()) + MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8()) + CMPB(tmp.As8(), tmp2.As8()) + JNE(LabelRef("repeat_extend_back_end_" + name)) + LEAL(Mem{Base: base, Disp: -1}, base) + DECL(i) + JNZ(LabelRef("repeat_extend_back_loop_" + name)) + } + Label("repeat_extend_back_end_" + name) + + // Base is now at start. Emit until base. 
+ // d += emitLiteral(dst[d:], src[nextEmit:base]) + if true { + o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name) + } + + // Extend forward + { + // s += 4 + checkRep + ADDL(U8(4+checkRep), s) + + if true { + // candidate := s - repeat + 4 + checkRep + MOVL(s, candidate) + SUBL(repeatL, candidate) // candidate = s - repeat + + // srcLeft = len(src) - s + srcLeft := GP64() + MOVQ(lenSrcQ, srcLeft) + SUBL(s, srcLeft.As32()) + assert(func(ok LabelRef) { + // if srcleft < maxint32: ok + CMPQ(srcLeft, U32(0x7fffffff)) + JL(ok) + }) + // Forward address + forwardStart := GP64() + LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart) + // End address + backStart := GP64() + LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart) + + length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name)) + forwardStart, backStart, srcLeft = nil, nil, nil + Label("repeat_extend_forward_end_" + name) + // s+= length + ADDL(length.As32(), s) + } + } + // Emit + if true { + // length = s-base + length := GP32() + MOVL(s, length) + SUBL(base.As32(), length) // length = s - base + + offsetVal := GP32() + MOVL(repeatL, offsetVal) + + if !o.snappy { + // if nextEmit == 0 {do copy instead...} + TESTL(nextEmit, nextEmit) + JZ(LabelRef("repeat_as_copy_" + name)) + + // Emit as repeat... + o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + + // Emit as copy instead... + Label("repeat_as_copy_" + name) + } + o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + + Label("repeat_end_emit_" + name) + // Store new dst and nextEmit + MOVL(s, nextEmitL) + } + // if s >= sLimit is picked up on next loop. + if false { + CMPL(s.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + JMP(LabelRef("search_loop_" + name)) + } + Label("no_repeat_found_" + name) + { + // Check candidates are ok. 
All must be < s and < len(src) + assert(func(ok LabelRef) { + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPL(tmp.As32(), candidate) + JG(ok) + }) + assert(func(ok LabelRef) { + CMPL(s, candidate) + JG(ok) + }) + assert(func(ok LabelRef) { + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPL(tmp.As32(), candidateS) + JG(ok) + }) + assert(func(ok LabelRef) { + CMPL(s, candidateS) + JG(ok) + }) + + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidate_match_" + name)) + + //if uint32(cv) == load32(src, candidateS) + CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidateS_match_" + name)) + + // No match found, next loop + // s = nextS + MOVL(nextSTempL, s) + JMP(LabelRef("search_loop_" + name)) + + // Short match at s, try a long candidate at s+1 + Label("candidateS_match_" + name) + if true { + hash0 := GP64() + SHRQ(U8(8), cv) + MOVQ(cv, hash0) + lHasher.hash(hash0) + MOVL(lTab.Idx(hash0, 4), candidate) + INCL(s) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + MOVL(s, lTab.Idx(hash0, 4)) + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + JEQ(LabelRef("candidate_match_" + name)) + // No match, decrement s again and use short match at s... + DECL(s) + } + MOVL(candidateS, candidate) + } + } + + Label("candidate_match_" + name) + // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. + // Extend backwards + if true { + ne := GP32() + MOVL(nextEmitL, ne) + TESTL(candidate, candidate) + JZ(LabelRef("match_extend_back_end_" + name)) + + // candidate is tested when decremented, so we loop back here. 
+ Label("match_extend_back_loop_" + name) + // if s <= nextEmit {exit} + CMPL(s, ne) + JLE(LabelRef("match_extend_back_end_" + name)) + // if src[candidate-1] == src[s-1] + tmp, tmp2 := GP64(), GP64() + MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8()) + MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8()) + CMPB(tmp.As8(), tmp2.As8()) + JNE(LabelRef("match_extend_back_end_" + name)) + LEAL(Mem{Base: s, Disp: -1}, s) + DECL(candidate) + JZ(LabelRef("match_extend_back_end_" + name)) + JMP(LabelRef("match_extend_back_loop_" + name)) + } + Label("match_extend_back_end_" + name) + + // Bail if we exceed the maximum size. + if true { + // tmp = s-nextEmit + tmp := GP64() + MOVL(s, tmp.As32()) + SUBL(nextEmitL, tmp.As32()) + // tmp = &dst + s-nextEmit + LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp) + CMPQ(tmp, dstLimitPtrQ) + JL(LabelRef("match_dst_size_check_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + } + Label("match_dst_size_check_" + name) + { + base := GP32() + MOVL(s, base.As32()) + o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) + } + cv := GP64() + Label("match_nolit_loop_" + name) + { + // Update repeat + { + // repeat = base - candidate + repeatVal := GP64().As32() + MOVL(s, repeatVal) + SUBL(candidate, repeatVal) + MOVL(repeatVal, repeatL) + } + // s+=4, candidate+=4 + ADDL(U8(4), s) + ADDL(U8(4), candidate) + // Extend the 4-byte match as long as possible and emit copy. + { + assert(func(ok LabelRef) { + // s must be > candidate cannot be equal. 
+ CMPL(s, candidate) + JG(ok) + }) + // srcLeft = len(src) - s + srcLeft := GP64() + MOVQ(lenSrcQ, srcLeft) + SUBL(s, srcLeft.As32()) + assert(func(ok LabelRef) { + // if srcleft < maxint32: ok + CMPQ(srcLeft, U32(0x7fffffff)) + JL(ok) + }) + + a, b := GP64(), GP64() + LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) + LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) + length := o.matchLen("match_nolit_"+name, + a, b, + srcLeft, + LabelRef("match_nolit_end_"+name), + ) + Label("match_nolit_end_" + name) + assert(func(ok LabelRef) { + CMPL(length.As32(), U32(math.MaxInt32)) + JL(ok) + }) + a, b, srcLeft = nil, nil, nil + + // s += length (length is destroyed, use it now) + ADDL(length.As32(), s) + + // Load offset from repeat value. + offset := GP64() + MOVL(repeatL, offset.As32()) + + // length += 4 + ADDL(U8(4), length.As32()) + MOVL(s, nextEmitL) // nextEmit = s + o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) + Label("match_nolit_emitcopy_end_" + name) + + // if s >= sLimit { end } + { + CMPL(s.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + // Start load candidate+1 as early as possible... + // Candidate is + 4 + MOVQ(Mem{Base: src, Index: candidate, Scale: 1, Disp: 1 - 4}, cv) + // Bail if we exceed the maximum size. + { + CMPQ(dst, dstLimitPtrQ) + JL(LabelRef("match_nolit_dst_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + } + } + Label("match_nolit_dst_ok_" + name) + // cv must be set to value at candidate+1 before arriving here + if true { + lHasher := hashN(lHashBytes, lTableBits) + sHasher := hashN(sHashBytes, sTableBits) + + // Index candidate+1 long, candidate+2 short... 
+ hash0, hash1 := GP64(), GP64() + MOVQ(cv, hash0) // src[candidate+1] + MOVQ(cv, hash1) + SHRQ(U8(8), hash1) // src[candidate+2] + cp1, cp2 := GP32(), GP32() // candidate+1, candidate + 2 + LEAL(Mem{Base: candidate, Disp: 1 - 4}, cp1) + LEAL(Mem{Base: candidate, Disp: 2 - 4}, cp2) + // Load s-2 early + MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) + + lHasher.hash(hash0) + sHasher.hash(hash1) + + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + MOVL(cp1, lTab.Idx(hash0, 4)) + MOVL(cp2, sTab.Idx(hash1, 4)) + + // Index s-2 long, s-1 short... + MOVQ(cv, hash0) // src[s-2] + MOVQ(cv, hash1) // src[s-1] + SHRQ(U8(8), hash1) + sm1, sm2 := GP32(), GP32() // s -1, s - 2 + LEAL(Mem{Base: s, Disp: -2}, sm2) + LEAL(Mem{Base: s, Disp: -1}, sm1) + lHasher.hash(hash0) + sHasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + MOVL(sm2, lTab.Idx(hash0, 4)) + MOVL(sm1, sTab.Idx(hash1, 4)) + } + JMP(LabelRef("search_loop_" + name)) + } + + Label("emit_remainder_" + name) + // Bail if we exceed the maximum size. + // if dst+(len(src)-nextEmit)+literalMaxOverhead >= dstLimit { return 0 } + { + // remain = len(src) - nextEmit + remain := GP64() + MOVQ(lenSrcQ, remain) + SUBL(nextEmitL, remain.As32()) + + dstExpect := GP64() + // dst := dst + (len(src)-nextEmitL) + + LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect) + CMPQ(dstExpect, dstLimitPtrQ) + JL(LabelRef("emit_remainder_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) + } + MOVQ(U32(0), ri.Addr) + RET() + Label("emit_remainder_ok_" + name) + } + // emitLiteral(dst[d:], src[nextEmitL:]) + emitEnd := GP64() + MOVQ(lenSrcQ, emitEnd) + + // Emit final literals.
+ o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name) + + // Assert size is < limit + assert(func(ok LabelRef) { + // if dstBaseQ < dstLimitPtrQ: ok + CMPQ(dst, dstLimitPtrQ) + JL(ok) + }) + + // length := start - base (ptr arithmetic) + length := GP64() + base := Load(Param("dst").Base(), GP64()) + MOVQ(dst, length) + SUBQ(base, length) + + // Assert size is < len(src) + assert(func(ok LabelRef) { + // if len(src) >= length: ok + CMPQ(lenSrcQ, length) + JGE(ok) + }) + // Assert size is < len(dst) + assert(func(ok LabelRef) { + // if len(dst) >= length: ok + CMPQ(lenDstQ, length) + JGE(ok) + }) + Store(length, ReturnIndex(0)) + RET() +} + +// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. +// Checks if base == nextemit. +// src & base are untouched. +func (o options) emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string) { + nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64() + MOVL(nextEmitL, nextEmit) + CMPL(nextEmit, base.As32()) + JEQ(LabelRef("emit_literal_skip_" + name)) + MOVL(base.As32(), litLen.As32()) + + // Base is now next emit. + MOVL(base.As32(), nextEmitL) + + // litBase = src[nextEmitL:] + LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) + SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit + + // Load (and store when we return) + MOVQ(dstBase, dstBaseTmp) + o.emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), true) + Label("emit_literal_done_" + name) + + // Emitted length must be > litlen. + // We have already checked for len(0) above. 
+ assert(func(ok LabelRef) { + tmp := GP64() + MOVQ(dstBaseTmp, tmp) + SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase + // if tmp > litLen: ok + CMPQ(tmp, litLen.As64()) + JG(ok) + }) + // Store updated dstBase + MOVQ(dstBaseTmp, dstBase) + Label("emit_literal_skip_" + name) +} + +// emitLiteralsDstP emits literals from nextEmit to base, updates nextEmit and advances dst. +// Checks if base == nextemit. +// src & base are untouched. +func (o options) emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string) { + nextEmit, litLen, litBase := GP32(), GP32(), GP64() + MOVL(nextEmitL, nextEmit) + CMPL(nextEmit, base.As32()) + JEQ(LabelRef("emit_literal_done_" + name)) + MOVL(base.As32(), litLen.As32()) + + // Base is now next emit. + MOVL(base.As32(), nextEmitL) + + // litBase = src[nextEmitL:] + LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) + SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit + + // Load (and store when we return) + o.emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), true) + Label("emit_literal_done_" + name) +} + +type hashGen struct { + bytes int + tablebits int + mulreg reg.GPVirtual +} + +// hashN returns a multiplicative hash generator for the lowest 'hashBytes' bytes of a value, producing 'tablebits' bits of output. +func hashN(hashBytes, tablebits int) hashGen { + h := hashGen{ + bytes: hashBytes, + tablebits: tablebits, + mulreg: GP64(), + } + primebytes := uint64(0) + switch hashBytes { + case 3: + primebytes = 506832829 + case 4: + primebytes = 2654435761 + case 5: + primebytes = 889523592379 + case 6: + primebytes = 227718039650203 + case 7: + primebytes = 58295818150454627 + case 8: + primebytes = 0xcf1bbcdcb7a56463 + default: + panic("invalid hash length") + } + MOVQ(Imm(primebytes), h.mulreg) + return h +} + +// hash uses multiply to get hash of the value. +func (h hashGen) hash(val reg.GPVirtual) { + // Move value to top of register.
+ if h.bytes < 8 { + SHLQ(U8(64-8*h.bytes), val) + } IMULQ(h.mulreg, val) // Move value to bottom SHRQ(U8(64-h.tablebits), val) diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go index 253f84f3c1..c3fc8d1ee3 100644 --- a/s2/encode_amd64.go +++ b/s2/encode_amd64.go @@ -36,6 +36,38 @@ func encodeBlock(dst, src []byte) (d int) { return encodeBlockAsm8B(dst, src) } +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetter(dst, src []byte) (d int) { + const ( + // Use 12 bit table when less than... + limit12B = 16 << 10 + // Use 10 bit table when less than... + limit10B = 4 << 10 + // Use 8 bit table when less than... + limit8B = 512 + ) + + if len(src) >= limit12B { + return encodeBetterBlockAsm(dst, src) + } + if len(src) >= limit10B { + return encodeBetterBlockAsm12B(dst, src) + } + if len(src) >= limit8B { + return encodeBetterBlockAsm10B(dst, src) + } + if len(src) < minNonLiteralBlockSize { + return 0 + } + return encodeBetterBlockAsm8B(dst, src) +} + // encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encode_better.go b/s2/encode_better.go index f4c5e04d2f..636859aa1d 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -44,7 +44,7 @@ func hash8(u uint64, h uint8) uint32 { // It also assumes that: // len(dst) >= MaxEncodedLen(len(src)) && // minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize -func encodeBlockBetter(dst, src []byte) (d int) { +func encodeBlockBetterGo(dst, src []byte) (d int) { // Initialize the hash tables. const ( // Long hash matches. 
diff --git a/s2/encode_go.go b/s2/encode_go.go index 82f0047df6..8be2b8f86f 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -20,6 +20,16 @@ func encodeBlock(dst, src []byte) (d int) { return encodeBlockGo(dst, src) } +// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockBetter(dst, src []byte) (d int) { + return encodeBlockBetterGo(dst, src) +} + // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 99e7d68bee..b9c89f2325 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -62,6 +62,34 @@ func encodeSnappyBlockAsm10B(dst []byte, src []byte) int //go:noescape func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4294967295 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm(dst []byte, src []byte) int + +// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16383 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm12B(dst []byte, src []byte) int + +// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4095 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm10B(dst []byte, src []byte) int + +// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 511 bytes. 
+// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm8B(dst []byte, src []byte) int + // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 239c1c7de1..9edb7009d0 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -7209,6 +7209,3261 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: MOVQ AX, ret+48(FP) RET +// func encodeBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -5(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x07, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + MOVL BP, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 262168(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 262168(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm + 
+repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL SI, BP + JLE repeat_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm + +repeat_extend_back_end_encodeBetterBlockAsm: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x00010000 + JLT three_bytes_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x01000000 + JLT four_bytes_repeat_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +four_bytes_repeat_emit_encodeBetterBlockAsm: + MOVL BP, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +three_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +two_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +one_byte_repeat_emit_encodeBetterBlockAsm: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + CMPQ DI, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 
+ CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + +memmove_long_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + 
LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm: + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_loop_repeat_extend_encodeBetterBlockAsm: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + +matchlen_single_repeat_extend_encodeBetterBlockAsm: + TESTL DI, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm: + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm + LEAL 1(R10), 
R10 + DECL DI + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm + +repeat_extend_forward_end_encodeBetterBlockAsm: + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm: + TESTL BP, BP + JZ repeat_end_emit_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm + +no_repeat_found_encodeBetterBlockAsm: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP 
search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + +match_extend_back_end_encodeBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm + CMPL DI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm + CMPL DI, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm + CMPL DI, $0x01000000 + JLT four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + MOVL DI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW DI, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB 
$0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), DI + CMPQ R8, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + 
MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), DI + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm: + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), R8 + XORL R10, R10 + CMPL SI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_match_nolit_encodeBetterBlockAsm: + MOVQ 
(DI)(R10*1), R9 + XORQ (R8)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_loop_match_nolit_encodeBetterBlockAsm: + LEAL -8(SI), SI + LEAL 8(R10), R10 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + +matchlen_single_match_nolit_encodeBetterBlockAsm: + TESTL SI, SI + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + MOVB (DI)(R10*1), R9 + CMPB (R8)(R10*1), R9 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R10), R10 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + +match_nolit_end_encodeBetterBlockAsm: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + CMPL SI, $0x00010000 + JL two_byte_offset_match_nolit_encodeBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: + CMPL R10, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(R10), R10 + ADDQ $0x05, AX + CMPL R10, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R10, R10 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + MOVB 
$0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy_three_match_nolit_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + MOVQ -3(DX)(BP*1), SI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, DI + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -3(BP), R11 + LEAL -2(BP), BP + MOVQ -2(DX)(CX*1), SI + SHLQ $0x08, R9 + IMULQ DI, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL R11, 24(SP)(R9*4) + MOVL BP, 262168(SP)(R10*4) + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -2(CX), SI + LEAL -1(CX), BP + SHLQ $0x08, R9 + IMULQ DI, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL BP, 262168(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm + +emit_remainder_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, 
AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +four_bytes_emit_remainder_encodeBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +two_bytes_emit_remainder_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +one_byte_emit_remainder_encodeBetterBlockAsm: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x07, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOU 32(BP), X6 + MOVOU 48(BP), X7 + MOVOU 64(BP), X8 + MOVOU 80(BP), X9 + MOVOU 96(BP), X10 + MOVOU 112(BP), X11 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + MOVOA X6, 32(R8) + MOVOA X7, 48(R8) + MOVOA X8, 64(R8) + MOVOA X9, 80(R8) + MOVOA X10, 96(R8) + MOVOA X11, 112(R8) + ADDQ $0x80, R8 + ADDQ $0x80, BP + ADDQ $0x80, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA 
X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm12B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000280, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm12B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm12B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -5(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm12B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x06, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BP + MOVL 65560(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 65560(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm12B + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm12B + +repeat_extend_back_loop_encodeBetterBlockAsm12B: + CMPL SI, BP + JLE repeat_extend_back_end_encodeBetterBlockAsm12B + MOVB 
-1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm12B + +repeat_extend_back_end_encodeBetterBlockAsm12B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm12B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + +two_bytes_repeat_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + +one_byte_repeat_emit_encodeBetterBlockAsm12B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(DI*1), BP + CMPQ DI, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP 
memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + +memmove_long_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(DI*1), BP + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm12B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm12B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B + +matchlen_single_repeat_extend_encodeBetterBlockAsm12B: + TESTL DI, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm12B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B + +repeat_extend_forward_end_encodeBetterBlockAsm12B: + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + JMP 
two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm12B + +no_repeat_found_encodeBetterBlockAsm12B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B + +match_extend_back_end_encodeBetterBlockAsm12B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm12B: + MOVL CX, SI 
+ MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm12B + CMPL DI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +two_bytes_match_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B + +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), DI + CMPQ R8, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + 
+emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B + +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), DI + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, 
R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), R8 + XORL R10, R10 + CMPL SI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVQ (DI)(R10*1), R9 + XORQ (R8)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeBetterBlockAsm12B: + LEAL -8(SI), SI + LEAL 8(R10), R10 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B + +matchlen_single_match_nolit_encodeBetterBlockAsm12B: + TESTL SI, SI + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVB (DI)(R10*1), R9 + CMPB (R8)(R10*1), R9 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R10), R10 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B + +match_nolit_end_encodeBetterBlockAsm12B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + +two_byte_offset_match_nolit_encodeBetterBlockAsm12B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + 
SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm12B + MOVQ -3(DX)(BP*1), SI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -3(BP), R11 + LEAL -2(BP), BP + MOVQ -2(DX)(CX*1), SI + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL R11, 24(SP)(R9*4) + MOVL BP, 65560(SP)(R10*4) + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -2(CX), SI + LEAL -1(CX), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL SI, 24(SP)(R9*4) + MOVL BP, 65560(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm12B + +emit_remainder_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm12B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm12B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm12B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +two_bytes_emit_remainder_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + 
CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + +one_byte_emit_remainder_encodeBetterBlockAsm12B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + 
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x07, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOU 32(BP), X6 + MOVOU 48(BP), X7 + MOVOU 64(BP), X8 + MOVOU 80(BP), X9 + MOVOU 96(BP), X10 + MOVOU 112(BP), X11 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + MOVOA X6, 32(R8) + MOVOA X7, 48(R8) + MOVOA X8, 64(R8) + MOVOA X9, 80(R8) + MOVOA X10, 96(R8) + MOVOA X11, 112(R8) + ADDQ $0x80, R8 + ADDQ $0x80, BP + ADDQ $0x80, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm10B(dst []byte, src []byte) int +// Requires: SSE2 
+TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x000000a0, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -5(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm10B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BP + MOVL 16408(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 16408(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm10B + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm10B + +repeat_extend_back_loop_encodeBetterBlockAsm10B: + CMPL SI, BP + JLE repeat_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm10B + +repeat_extend_back_end_encodeBetterBlockAsm10B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT 
one_byte_repeat_emit_encodeBetterBlockAsm10B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + +two_bytes_repeat_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + +one_byte_repeat_emit_encodeBetterBlockAsm10B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(DI*1), BP + CMPQ DI, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP 
memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + +memmove_long_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(DI*1), BP + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm10B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm10B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm10B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B + +matchlen_single_repeat_extend_encodeBetterBlockAsm10B: + TESTL DI, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm10B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B + +repeat_extend_forward_end_encodeBetterBlockAsm10B: + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + 
+emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm10B + +no_repeat_found_encodeBetterBlockAsm10B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B + +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm10B + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm10B + CMPL DI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + 
+two_bytes_match_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B + +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), DI + CMPQ R8, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + 
+emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), DI + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), R8 + XORL R10, R10 
+ CMPL SI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVQ (DI)(R10*1), R9 + XORQ (R8)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeBetterBlockAsm10B: + LEAL -8(SI), SI + LEAL 8(R10), R10 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B + +matchlen_single_match_nolit_encodeBetterBlockAsm10B: + TESTL SI, SI + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVB (DI)(R10*1), R9 + CMPB (R8)(R10*1), R9 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R10), R10 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B + +match_nolit_end_encodeBetterBlockAsm10B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + +two_byte_offset_match_nolit_encodeBetterBlockAsm10B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVQ -3(DX)(BP*1), SI + CMPQ AX, (SP) + JL 
match_nolit_dst_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -3(BP), R11 + LEAL -2(BP), BP + MOVQ -2(DX)(CX*1), SI + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL R11, 24(SP)(R9*4) + MOVL BP, 16408(SP)(R10*4) + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -2(CX), SI + LEAL -1(CX), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL SI, 24(SP)(R9*4) + MOVL BP, 16408(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm10B + +emit_remainder_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +two_bytes_emit_remainder_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + +one_byte_emit_remainder_encodeBetterBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + 
+memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x07, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOU 32(BP), X6 + MOVOU 48(BP), X7 + MOVOU 64(BP), X8 + MOVOU 80(BP), X9 + MOVOU 96(BP), X10 + MOVOU 112(BP), X11 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + MOVOA X6, 32(R8) + MOVOA X7, 48(R8) + MOVOA X8, 64(R8) + MOVOA X9, 80(R8) + MOVOA X10, 96(R8) + MOVOA X11, 112(R8) + ADDQ $0x80, R8 + ADDQ $0x80, BP + ADDQ $0x80, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000028, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm8B + MOVL $0x00000000, 
12(SP) + MOVQ src_len+32(FP), CX + LEAQ -5(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm8B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 4120(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 4120(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm8B + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm8B + +repeat_extend_back_loop_encodeBetterBlockAsm8B: + CMPL SI, BP + JLE repeat_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(SI), SI + DECL DI + JNZ repeat_extend_back_loop_encodeBetterBlockAsm8B + +repeat_extend_back_end_encodeBetterBlockAsm8B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm8B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + +two_bytes_repeat_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + 
+one_byte_repeat_emit_encodeBetterBlockAsm8B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BP + CMPQ DI, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 
-32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + +memmove_long_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BP + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 + JZ 
matchlen_loop_repeat_extend_encodeBetterBlockAsm8B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B + +matchlen_single_repeat_extend_encodeBetterBlockAsm8B: + TESTL DI, DI + JZ repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 + JNE repeat_extend_forward_end_encodeBetterBlockAsm8B + LEAL 1(R10), R10 + DECL DI + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B + +repeat_extend_forward_end_encodeBetterBlockAsm8B: + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm8B + +no_repeat_found_encodeBetterBlockAsm8B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B + +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, SI + MOVQ SI, R9 + 
SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm8B + +match_extend_back_loop_encodeBetterBlockAsm8B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B + +match_extend_back_end_encodeBetterBlockAsm8B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm8B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm8B + CMPL DI, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +two_bytes_match_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B + +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), DI + CMPQ R8, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB 
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B + +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), DI + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x07, R10 + MOVQ AX, 
R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + MOVOA X6, 32(R12) + MOVOA X7, 48(R12) + MOVOA X8, 64(R12) + MOVOA X9, 80(R12) + MOVOA X10, 96(R12) + MOVOA X11, 112(R12) + ADDQ $0x80, R12 + ADDQ $0x80, R9 + ADDQ $0x80, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), R8 + XORL R10, R10 + CMPL SI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVQ (DI)(R10*1), R9 + XORQ (R8)(R10*1), R9 + TESTQ R9, R9 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeBetterBlockAsm8B: + LEAL -8(SI), SI + LEAL 8(R10), R10 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B + +matchlen_single_match_nolit_encodeBetterBlockAsm8B: + TESTL SI, SI + JZ 
match_nolit_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVB (DI)(R10*1), R9 + CMPB (R8)(R10*1), R9 + JNE match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R10), R10 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B + +match_nolit_end_encodeBetterBlockAsm8B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + +two_byte_offset_match_nolit_encodeBetterBlockAsm8B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + CMPL R10, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R10*4), R10 + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R10*4), R10 + MOVB R10, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm8B + MOVQ -3(DX)(BP*1), SI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, DI + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -3(BP), R11 + LEAL -2(BP), BP + MOVQ -2(DX)(CX*1), SI + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL R11, 24(SP)(R9*4) + MOVL BP, 4120(SP)(R10*4) + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + LEAL -2(CX), SI + LEAL -1(CX), BP + SHLQ $0x10, R9 + IMULQ DI, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL SI, 24(SP)(R9*4) + MOVL BP, 4120(SP)(R10*4) + JMP 
search_loop_encodeBetterBlockAsm8B + +emit_remainder_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 4(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBetterBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBetterBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBetterBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +two_bytes_emit_remainder_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + +one_byte_emit_remainder_encodeBetterBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + 
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x07, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOU 32(BP), X6 + MOVOU 48(BP), X7 + MOVOU 64(BP), X8 + MOVOU 80(BP), X9 + MOVOU 96(BP), X10 + MOVOU 112(BP), X11 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + MOVOA X6, 32(R8) + 
MOVOA X7, 48(R8) + MOVOA X8, 64(R8) + MOVOA X9, 80(R8) + MOVOA X10, 96(R8) + MOVOA X11, 112(R8) + ADDQ $0x80, R8 + ADDQ $0x80, BP + ADDQ $0x80, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + // func emitLiteral(dst []byte, lit []byte) int // Requires: SSE2 TEXT ·emitLiteral(SB), NOSPLIT, $0-56 From 0b56de6c6a03a0d002467464ddec744a4734c269 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 19 Feb 2021 10:40:10 +0100 Subject: [PATCH 02/10] Finish compression. 
--- s2/_generate/gen.go | 273 +- s2/cmd/s2c/main.go | 36 +- s2/encodeblock_amd64.go | 32 +- s2/encodeblock_amd64.s | 7178 ++++++++++++++++++++++----------------- 4 files changed, 4296 insertions(+), 3223 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index af096dca6a..d663225742 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -39,6 +39,11 @@ func main() { o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B) o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) + // Snappy compatible o.snappy = true o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B) @@ -46,11 +51,6 @@ func main() { o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B) o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) - o.snappy = false o.maxLen = math.MaxUint32 o.genEmitLiteral() @@ -1145,143 +1145,153 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash RET() } Label("match_dst_size_check_" + name) + + base := GP32() + MOVL(s, base.As32()) + + // s+=4, candidate+=4 + ADDL(U8(4), s) + ADDL(U8(4), candidate) + // Extend the 4-byte match as long as possible and emit copy. 
{ - base := GP32() - MOVL(s, base.As32()) - o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) - } - cv := GP64() - Label("match_nolit_loop_" + name) - { + assert(func(ok LabelRef) { + // s must be > candidate cannot be equal. + CMPL(s, candidate) + JG(ok) + }) + // srcLeft = len(src) - s + srcLeft := GP64() + MOVQ(lenSrcQ, srcLeft) + SUBL(s, srcLeft.As32()) + assert(func(ok LabelRef) { + // if srcleft < maxint32: ok + CMPQ(srcLeft, U32(0x7fffffff)) + JL(ok) + }) + + a, b := GP64(), GP64() + LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) + LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) + length := o.matchLen("match_nolit_"+name, + a, b, + srcLeft, + LabelRef("match_nolit_end_"+name), + ) + Label("match_nolit_end_" + name) + assert(func(ok LabelRef) { + CMPL(length.As32(), U32(math.MaxInt32)) + JL(ok) + }) + a, b, srcLeft = nil, nil, nil + // Update repeat { // repeat = base - candidate repeatVal := GP64().As32() MOVL(s, repeatVal) SUBL(candidate, repeatVal) + // Check if match is better.. + if true { + CMPL(length.As32(), U8(1)) + JG(LabelRef("match_length_ok_" + name)) + CMPL(repeatVal, U32(65535)) + JLE(LabelRef("match_length_ok_" + name)) + // Match is equal or worse to the encoding. + MOVL(nextSTempL, s) + INCL(s) + JMP(LabelRef("search_loop_" + name)) + Label("match_length_ok_" + name) + } + // Store updated repeat MOVL(repeatVal, repeatL) } - // s+=4, candidate+=4 - ADDL(U8(4), s) - ADDL(U8(4), candidate) - // Extend the 4-byte match as long as possible and emit copy. - { - assert(func(ok LabelRef) { - // s must be > candidate cannot be equal. 
- CMPL(s, candidate) - JG(ok) - }) - // srcLeft = len(src) - s - srcLeft := GP64() - MOVQ(lenSrcQ, srcLeft) - SUBL(s, srcLeft.As32()) - assert(func(ok LabelRef) { - // if srcleft < maxint32: ok - CMPQ(srcLeft, U32(0x7fffffff)) - JL(ok) - }) - - a, b := GP64(), GP64() - LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) - LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) - length := o.matchLen("match_nolit_"+name, - a, b, - srcLeft, - LabelRef("match_nolit_end_"+name), - ) - Label("match_nolit_end_" + name) - assert(func(ok LabelRef) { - CMPL(length.As32(), U32(math.MaxInt32)) - JL(ok) - }) - a, b, srcLeft = nil, nil, nil + // Emit.... + o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) + // s += length (length is destroyed, use it now) + ADDL(length.As32(), s) - // s += length (length is destroyed, use it now) - ADDL(length.As32(), s) + // Load offset from repeat value. + offset := GP64() + MOVL(repeatL, offset.As32()) - // Load offset from repeat value. - offset := GP64() - MOVL(repeatL, offset.As32()) - - // length += 4 - ADDL(U8(4), length.As32()) - MOVL(s, nextEmitL) // nextEmit = s - o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) - Label("match_nolit_emitcopy_end_" + name) + // length += 4 + ADDL(U8(4), length.As32()) + MOVL(s, nextEmitL) // nextEmit = s + o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) + Label("match_nolit_emitcopy_end_" + name) - // if s >= sLimit { end } - { - CMPL(s.As32(), sLimitL) - JGE(LabelRef("emit_remainder_" + name)) - } - // Start load candidate+1 as early as possible... - // Candidate is + 4 - MOVQ(Mem{Base: src, Index: candidate, Scale: 1, Disp: 1 - 4}, cv) - // Bail if we exceed the maximum size. 
- { - CMPQ(dst, dstLimitPtrQ) - JL(LabelRef("match_nolit_dst_ok_" + name)) - ri, err := ReturnIndex(0).Resolve() - if err != nil { - panic(err) - } - MOVQ(U32(0), ri.Addr) - RET() + // if s >= sLimit { end } + { + CMPL(s.As32(), sLimitL) + JGE(LabelRef("emit_remainder_" + name)) + } + // Start load candidate+1 as early as possible... + // Bail if we exceed the maximum size. + { + CMPQ(dst, dstLimitPtrQ) + JL(LabelRef("match_nolit_dst_ok_" + name)) + ri, err := ReturnIndex(0).Resolve() + if err != nil { + panic(err) } + MOVQ(U32(0), ri.Addr) + RET() } - Label("match_nolit_dst_ok_" + name) - // cv must be set to value at candidate+1 before arriving here - if true { - lHasher := hashN(lHashBytes, lTableBits) - sHasher := hashN(sHashBytes, sTableBits) - - // Index candidate+1 long, candidate+2 short... - hash0, hash1 := GP64(), GP64() - MOVQ(cv, hash0) // src[candidate+1] - MOVQ(cv, hash1) - SHRQ(U8(8), hash1) // src[candidate+2] - cp1, cp2 := GP32(), GP32() // candidate+1, candidate + 2 - LEAL(Mem{Base: candidate, Disp: 1 - 4}, cp1) - LEAL(Mem{Base: candidate, Disp: 2 - 4}, cp2) - // Load s-2 early - MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) - - lHasher.hash(hash0) - sHasher.hash(hash1) - - assert(func(ok LabelRef) { - CMPQ(hash0, U32(lTableSize)) - JL(ok) - }) - assert(func(ok LabelRef) { - CMPQ(hash1, U32(sTableSize)) - JL(ok) - }) - MOVL(cp1, lTab.Idx(hash0, 4)) - MOVL(cp2, sTab.Idx(hash1, 4)) + } + Label("match_nolit_dst_ok_" + name) + // cv must be set to value at base+1 before arriving here + if true { + lHasher := hashN(lHashBytes, lTableBits) + sHasher := hashN(sHashBytes, sTableBits) - // Index s-2 long, s-1 short... 
- MOVQ(cv, hash0) // src[s-2] - MOVQ(cv, hash1) // src[s-1] - SHRQ(U8(8), hash1) - sm1, sm2 := GP32(), GP32() // s -1, s - 2 - LEAL(Mem{Base: s, Disp: -2}, sm2) - LEAL(Mem{Base: s, Disp: -1}, sm1) - lHasher.hash(hash0) - sHasher.hash(hash1) - assert(func(ok LabelRef) { - CMPQ(hash0, U32(lTableSize)) - JL(ok) - }) - assert(func(ok LabelRef) { - CMPQ(hash1, U32(sTableSize)) - JL(ok) - }) - MOVL(sm2, lTab.Idx(hash0, 4)) - MOVL(sm1, sTab.Idx(hash1, 4)) - } - JMP(LabelRef("search_loop_" + name)) + // Index base+1 long, base+2 short... + cv := GP64() + INCL(base) + MOVQ(Mem{Base: src, Index: base, Scale: 1, Disp: 0}, cv) + hash0, hash1 := GP64(), GP64() + MOVQ(cv, hash0) // src[base+1] + MOVQ(cv, hash1) + SHRQ(U8(8), hash1) // src[base+2] + bp1 := GP32() // base+1 + LEAL(Mem{Base: base, Disp: 1}, bp1) + + // Load s-2 early + MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) + + lHasher.hash(hash0) + sHasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + MOVL(base, lTab.Idx(hash0, 4)) + MOVL(bp1, sTab.Idx(hash1, 4)) + + // Index s-2 long, s-1 short... + MOVQ(cv, hash0) // src[s-2] + MOVQ(cv, hash1) // src[s-1] + SHRQ(U8(8), hash1) + sm1, sm2 := GP32(), GP32() // s -1, s - 2 + LEAL(Mem{Base: s, Disp: -2}, sm2) + LEAL(Mem{Base: s, Disp: -1}, sm1) + lHasher.hash(hash0) + sHasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash0, U32(lTableSize)) + JL(ok) + }) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(sTableSize)) + JL(ok) + }) + MOVL(sm2, lTab.Idx(hash0, 4)) + MOVL(sm1, sTab.Idx(hash1, 4)) } + JMP(LabelRef("search_loop_" + name)) Label("emit_remainder_" + name) // Bail if we exceed the maximum size. 
@@ -1322,9 +1332,9 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash // length := start - base (ptr arithmetic) length := GP64() - base := Load(Param("dst").Base(), GP64()) + dstBase := Load(Param("dst").Base(), GP64()) MOVQ(dst, length) - SUBQ(base, length) + SUBQ(dstBase, length) // Assert size is < len(src) assert(func(ok LabelRef) { @@ -1643,6 +1653,7 @@ func (o options) genEmitRepeat() { // Will jump to end label when finished. // Uses 1 GP register. func (o options) emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { + Comment("emitRepeat") Label("emit_repeat_again_" + name) tmp := GP32() MOVL(length.As32(), tmp) // Copy length @@ -1834,6 +1845,8 @@ const ( // Will jump to end label when finished. // Uses 2 GP registers. func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { + Comment("emitCopy") + if o.maxLen >= 65536 { //if offset >= 65536 { CMPL(offset.As32(), U32(65536)) @@ -1979,6 +1992,7 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir // All passed registers may be updated. // Length must be 1 -> 64 bytes func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, end LabelRef) { + Comment("genMemMoveShort") AX, CX := GP64(), GP64() name += "_memmove_" @@ -2070,6 +2084,7 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en // AVX uses 4 GP registers 16 AVX/SSE registers. // All passed registers may be updated. func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) { + Comment("genMemMoveLong") name += "large_" assert(func(ok LabelRef) { @@ -2342,6 +2357,7 @@ func (o options) genMatchLen() { // Will jump to end when done and returns the length. // Uses 2 GP registers. 
func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual { + Comment("matchLen") if false { return o.matchLenAlt(name, a, b, len, end) } @@ -2390,6 +2406,7 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re // Uses 3 GP registers. // It is better on longer matches. func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual { + Comment("matchLenAlt") tmp, tmp2, matched := GP64(), GP64(), GP32() XORL(matched, matched) diff --git a/s2/cmd/s2c/main.go b/s2/cmd/s2c/main.go index 7bcef824d9..90c128b7fa 100644 --- a/s2/cmd/s2c/main.go +++ b/s2/cmd/s2c/main.go @@ -36,6 +36,7 @@ var ( remove = flag.Bool("rm", false, "Delete source file(s) after successful compression") quiet = flag.Bool("q", false, "Don't write any output to terminal, except errors") bench = flag.Int("bench", 0, "Run benchmark n times. No output will be written") + verify = flag.Bool("verify", false, "Verify written files") help = flag.Bool("help", false, "Display help") cpuprofile, memprofile, traceprofile string @@ -152,7 +153,8 @@ Options:`) exitErr(err) file.Close() for i := 0; i < *bench; i++ { - wc := wCounter{out: ioutil.Discard} + w, errFn := verifyTo(ioutil.Discard) + wc := wCounter{out: w} if !*quiet { fmt.Print("\nCompressing...") } @@ -170,6 +172,7 @@ Options:`) ms := elapsed.Round(time.Millisecond) fmt.Printf(" %d -> %d [%.02f%%]; %v, %.01fMB/s", input, wc.n, pct, ms, mbpersec) } + exitErr(errFn()) } fmt.Println("") wr.Close() @@ -218,6 +221,7 @@ Options:`) defer bw.Flush() out = bw } + out, errFn := verifyTo(out) wc := wCounter{out: out} wr.Reset(&wc) defer wr.Close() @@ -232,6 +236,7 @@ Options:`) pct := float64(wc.n) * 100 / float64(input) fmt.Printf(" %d -> %d [%.02f%%]; %.01fMB/s\n", input, wc.n, pct, mbpersec) } + exitErr(errFn()) if *remove { closeOnce.Do(func() { file.Close() @@ -246,6 +251,35 @@ Options:`) } } +func verifyTo(w io.Writer) (io.Writer, func() error) { + if 
!*verify { + return w, func() error { + return nil + } + } + pr, pw := io.Pipe() + writer := io.MultiWriter(w, pw) + var wg sync.WaitGroup + var err error + wg.Add(1) + go func() { + defer wg.Done() + r := s2.NewReader(pr) + _, err = io.Copy(ioutil.Discard, r) + pr.CloseWithError(fmt.Errorf("verify: %w", err)) + }() + return writer, func() error { + pw.Close() + wg.Wait() + if err == nil { + if !*quiet { + fmt.Print("... Verified ok.") + } + } + return err + } +} + func printErr(err error) { if err != nil { fmt.Fprintln(os.Stderr, "\nERROR:", err.Error()) diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index b9c89f2325..cb04bdd295 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -34,61 +34,61 @@ func encodeBlockAsm10B(dst []byte, src []byte) int //go:noescape func encodeBlockAsm8B(dst []byte, src []byte) int -// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4294967295 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeSnappyBlockAsm(dst []byte, src []byte) int +func encodeBetterBlockAsm(dst []byte, src []byte) int -// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 16383 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeSnappyBlockAsm12B(dst []byte, src []byte) int +func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4095 bytes. 
// It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeSnappyBlockAsm10B(dst []byte, src []byte) int +func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 511 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4294967295 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeBetterBlockAsm(dst []byte, src []byte) int +func encodeSnappyBlockAsm(dst []byte, src []byte) int -// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. +// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 16383 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeBetterBlockAsm12B(dst []byte, src []byte) int +func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. +// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4095 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. 
// //go:noescape -func encodeBetterBlockAsm10B(dst []byte, src []byte) int +func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. +// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 511 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. // //go:noescape -func encodeBetterBlockAsm8B(dst []byte, src []byte) int +func encodeSnappyBlockAsm8B(dst []byte, src []byte) int // emitLiteral writes a literal chunk and returns the number of bytes written. // diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 9edb7009d0..c78bace185 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -144,6 +144,8 @@ one_byte_repeat_emit_encodeBlockAsm: memmove_repeat_emit_encodeBlockAsm: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3 @@ -205,7 +207,9 @@ memmove_end_copy_repeat_emit_encodeBlockAsm: JMP emit_literal_done_repeat_emit_encodeBlockAsm memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -266,6 +270,8 @@ emit_literal_done_repeat_emit_encodeBlockAsm: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm @@ -306,6 +312,7 @@ repeat_extend_forward_end_encodeBlockAsm: TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm + // emitRepeat emit_repeat_again_match_repeat_encodeBlockAsm: MOVL BP, DI LEAL -4(BP), BP @@ -373,6 +380,7 @@ repeat_two_offset_match_repeat_encodeBlockAsm: JMP repeat_end_emit_encodeBlockAsm repeat_as_copy_encodeBlockAsm: + // emitCopy CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm @@ -386,6 +394,7 @@ 
four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm + // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: MOVL BP, DI LEAL -4(BP), BP @@ -471,6 +480,7 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm: LEAL -60(BP), BP ADDQ $0x03, AX + // emitRepeat emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: MOVL BP, DI LEAL -4(BP), BP @@ -667,6 +677,8 @@ one_byte_match_emit_encodeBlockAsm: memmove_match_emit_encodeBlockAsm: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3 @@ -728,7 +740,9 @@ memmove_end_copy_match_emit_encodeBlockAsm: JMP emit_literal_done_match_emit_encodeBlockAsm memmove_long_match_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -792,6 +806,8 @@ match_nolit_loop_encodeBlockAsm: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm @@ -829,6 +845,8 @@ match_nolit_end_encodeBlockAsm: MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) + + // emitCopy CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm @@ -842,6 +860,7 @@ four_bytes_loop_back_match_nolit_encodeBlockAsm: CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm + // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: MOVL R9, SI LEAL -4(R9), R9 @@ -927,6 +946,7 @@ two_byte_offset_match_nolit_encodeBlockAsm: LEAL -60(R9), R9 ADDQ $0x03, AX + // emitRepeat emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: MOVL R9, SI LEAL -4(R9), R9 @@ -1109,6 +1129,8 @@ one_byte_emit_remainder_encodeBlockAsm: memmove_emit_remainder_encodeBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 @@ -1170,8 +1192,10 @@ memmove_end_copy_emit_remainder_encodeBlockAsm: JMP emit_literal_done_emit_remainder_encodeBlockAsm memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -1349,6 +1373,8 @@ one_byte_repeat_emit_encodeBlockAsm12B: memmove_repeat_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 @@ -1410,7 +1436,9 @@ memmove_end_copy_repeat_emit_encodeBlockAsm12B: JMP emit_literal_done_repeat_emit_encodeBlockAsm12B memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -1471,6 +1499,8 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm12B @@ -1510,14 +1540,16 @@ repeat_extend_forward_end_encodeBlockAsm12B: MOVL 16(SP), SI TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm12B - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm12B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm12B + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm12B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm12B cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: CMPL BP, $0x00000104 @@ -1554,6 +1586,7 @@ 
repeat_two_offset_match_repeat_encodeBlockAsm12B: JMP repeat_end_emit_encodeBlockAsm12B repeat_as_copy_encodeBlockAsm12B: + // emitCopy two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B @@ -1561,6 +1594,8 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm12B: MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX + + // emitRepeat MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 @@ -1715,6 +1750,8 @@ one_byte_match_emit_encodeBlockAsm12B: memmove_match_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 @@ -1776,7 +1813,9 @@ memmove_end_copy_match_emit_encodeBlockAsm12B: JMP emit_literal_done_match_emit_encodeBlockAsm12B memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -1840,6 +1879,8 @@ match_nolit_loop_encodeBlockAsm12B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm12B @@ -1878,6 +1919,7 @@ match_nolit_end_encodeBlockAsm12B: ADDL $0x04, R9 MOVL CX, 12(SP) + // emitCopy two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B @@ -1885,6 +1927,8 @@ two_byte_offset_match_nolit_encodeBlockAsm12B: MOVW BP, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX + + // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 @@ -2025,6 +2069,8 @@ one_byte_emit_remainder_encodeBlockAsm12B: memmove_emit_remainder_encodeBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 @@ -2086,8 +2132,10 @@ 
memmove_end_copy_emit_remainder_encodeBlockAsm12B: JMP emit_literal_done_emit_remainder_encodeBlockAsm12B memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -2265,6 +2313,8 @@ one_byte_repeat_emit_encodeBlockAsm10B: memmove_repeat_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 @@ -2326,7 +2376,9 @@ memmove_end_copy_repeat_emit_encodeBlockAsm10B: JMP emit_literal_done_repeat_emit_encodeBlockAsm10B memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -2387,6 +2439,8 @@ emit_literal_done_repeat_emit_encodeBlockAsm10B: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm10B @@ -2426,14 +2480,16 @@ repeat_extend_forward_end_encodeBlockAsm10B: MOVL 16(SP), SI TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm10B - MOVL BP, DI - LEAL -4(BP), BP - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm10B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm10B + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm10B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm10B cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: CMPL BP, $0x00000104 @@ -2470,6 +2526,7 @@ repeat_two_offset_match_repeat_encodeBlockAsm10B: JMP repeat_end_emit_encodeBlockAsm10B repeat_as_copy_encodeBlockAsm10B: + // emitCopy 
two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B @@ -2477,6 +2534,8 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm10B: MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX + + // emitRepeat MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 @@ -2631,6 +2690,8 @@ one_byte_match_emit_encodeBlockAsm10B: memmove_match_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3 @@ -2692,7 +2753,9 @@ memmove_end_copy_match_emit_encodeBlockAsm10B: JMP emit_literal_done_match_emit_encodeBlockAsm10B memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -2756,6 +2819,8 @@ match_nolit_loop_encodeBlockAsm10B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm10B @@ -2794,6 +2859,7 @@ match_nolit_end_encodeBlockAsm10B: ADDL $0x04, R9 MOVL CX, 12(SP) + // emitCopy two_byte_offset_match_nolit_encodeBlockAsm10B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B @@ -2801,6 +2867,8 @@ two_byte_offset_match_nolit_encodeBlockAsm10B: MOVW BP, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX + + // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 @@ -2941,6 +3009,8 @@ one_byte_emit_remainder_encodeBlockAsm10B: memmove_emit_remainder_encodeBlockAsm10B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 @@ -3002,8 +3072,10 @@ memmove_end_copy_emit_remainder_encodeBlockAsm10B: JMP emit_literal_done_emit_remainder_encodeBlockAsm10B memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL 
BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -3181,6 +3253,8 @@ one_byte_repeat_emit_encodeBlockAsm8B: memmove_repeat_emit_encodeBlockAsm8B: LEAQ (AX)(R8*1), BP + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 @@ -3242,7 +3316,9 @@ memmove_end_copy_repeat_emit_encodeBlockAsm8B: JMP emit_literal_done_repeat_emit_encodeBlockAsm8B memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), BP + LEAQ (AX)(R8*1), BP + + // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -3303,6 +3379,8 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B: SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP + + // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_single_repeat_extend_encodeBlockAsm8B @@ -3342,12 +3420,14 @@ repeat_extend_forward_end_encodeBlockAsm8B: MOVL 16(SP), SI TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm8B - MOVL BP, SI - LEAL -4(BP), BP - CMPL SI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm8B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: CMPL BP, $0x00000104 @@ -3382,6 +3462,7 @@ repeat_two_match_repeat_encodeBlockAsm8B: JMP repeat_end_emit_encodeBlockAsm8B repeat_as_copy_encodeBlockAsm8B: + // emitCopy two_byte_offset_repeat_as_copy_encodeBlockAsm8B: CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B @@ -3389,6 +3470,8 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm8B: MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX + + // emitRepeat MOVL BP, SI LEAL -4(BP), BP CMPL SI, $0x08 @@ -3537,6 +3620,8 @@ one_byte_match_emit_encodeBlockAsm8B: 
memmove_match_emit_encodeBlockAsm8B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3 @@ -3598,7 +3683,9 @@ memmove_end_copy_match_emit_encodeBlockAsm8B: JMP emit_literal_done_match_emit_encodeBlockAsm8B memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), DI + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -3662,6 +3749,8 @@ match_nolit_loop_encodeBlockAsm8B: SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BP*1), BP + + // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm8B @@ -3700,6 +3789,7 @@ match_nolit_end_encodeBlockAsm8B: ADDL $0x04, R9 MOVL CX, 12(SP) + // emitCopy two_byte_offset_match_nolit_encodeBlockAsm8B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B @@ -3707,6 +3797,8 @@ two_byte_offset_match_nolit_encodeBlockAsm8B: MOVW BP, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX + + // emitRepeat MOVL R9, BP LEAL -4(R9), R9 CMPL BP, $0x08 @@ -3841,6 +3933,8 @@ one_byte_emit_remainder_encodeBlockAsm8B: memmove_emit_remainder_encodeBlockAsm8B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 @@ -3902,8 +3996,10 @@ memmove_end_copy_emit_remainder_encodeBlockAsm8B: JMP emit_literal_done_emit_remainder_encodeBlockAsm8B memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -3962,15 +4058,15 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B: MOVQ AX, ret+48(FP) RET -// func encodeSnappyBlockAsm(dst []byte, src []byte) int +// func encodeBetterBlockAsm(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeSnappyBlockAsm(SB), 
$65560-56 +TEXT ·encodeBetterBlockAsm(SB), $327704-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX + MOVQ $0x00000a00, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeSnappyBlockAsm: +zero_loop_encodeBetterBlockAsm: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -3981,7 +4077,7 @@ zero_loop_encodeSnappyBlockAsm: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm + JNZ zero_loop_encodeBetterBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -3995,308 +4091,514 @@ zero_loop_encodeSnappyBlockAsm: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeSnappyBlockAsm: +search_loop_encodeBetterBlockAsm: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x06, BP - LEAL 4(CX)(BP*1), BP + SHRL $0x07, BP + LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm + JGE emit_remainder_encodeBetterBlockAsm MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BP MOVQ SI, R9 MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 + SHLQ $0x08, R9 IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ R8, R10 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI + MOVL 262168(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) + MOVL CX, 262168(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm -repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL SI, 
BP - JLE repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(DI*1), BL +repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeSnappyBlockAsm + JNE repeat_extend_back_end_encodeBetterBlockAsm LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm -repeat_extend_back_end_encodeSnappyBlockAsm: +repeat_extend_back_end_encodeBetterBlockAsm: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + MOVL SI, R8 MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm + JLT one_byte_repeat_emit_encodeBetterBlockAsm CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm + JLT two_bytes_repeat_emit_encodeBetterBlockAsm CMPL BP, $0x00010000 - JLT three_bytes_repeat_emit_encodeSnappyBlockAsm + JLT three_bytes_repeat_emit_encodeBetterBlockAsm CMPL BP, $0x01000000 - JLT four_bytes_repeat_emit_encodeSnappyBlockAsm + JLT four_bytes_repeat_emit_encodeBetterBlockAsm MOVB $0xfc, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm -four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL BP, R9 - SHRL $0x10, R9 +four_bytes_repeat_emit_encodeBetterBlockAsm: + MOVL BP, R10 + SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R9, 3(AX) + MOVB R10, 3(AX) ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm -three_bytes_repeat_emit_encodeSnappyBlockAsm: +three_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP 
memmove_long_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm -two_bytes_repeat_emit_encodeSnappyBlockAsm: +two_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm + JL memmove_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm -one_byte_repeat_emit_encodeSnappyBlockAsm: +one_byte_repeat_emit_encodeBetterBlockAsm: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BP - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 +memmove_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 
-emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - 
MOVOU -16(R8)(DI*1), X3 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: +memmove_end_copy_repeat_emit_encodeBetterBlockAsm: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm -memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BP - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x07, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 +memmove_long_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x07, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOU 32(R10), X6 + MOVOU 48(R10), X7 + MOVOU 64(R10), X8 + MOVOU 80(R10), X9 + 
MOVOU 96(R10), X10 + MOVOU 112(R10), X11 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 + ADDQ $0x80, R10 ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeSnappyBlockAsm: +emit_literal_done_repeat_emit_encodeBetterBlockAsm: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm 
-matchlen_loop_repeat_extend_encodeSnappyBlockAsm: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm +matchlen_loopback_repeat_extend_encodeBetterBlockAsm: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm -matchlen_single_repeat_extend_encodeSnappyBlockAsm: +matchlen_loop_repeat_extend_encodeBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + +matchlen_single_repeat_extend_encodeBetterBlockAsm: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm + +repeat_extend_forward_end_encodeBetterBlockAsm: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI TESTL DI, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm + JZ repeat_as_copy_encodeBetterBlockAsm -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm + // emitRepeat +emit_repeat_again_match_repeat_encodeBetterBlockAsm: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm -repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI - CMPL SI, $0x00010000 - JL 
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm + CMPL BP, $0x00010100 + JLT repeat_four_match_repeat_encodeBetterBlockAsm + CMPL BP, $0x0100ffff + JLT repeat_five_match_repeat_encodeBetterBlockAsm + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBetterBlockAsm -four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BP, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BP), BP +repeat_five_match_repeat_encodeBetterBlockAsm: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX - CMPL BP, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm -four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL BP, BP - JZ repeat_end_emit_encodeSnappyBlockAsm - MOVB $0x03, BL - LEAL -4(BX)(BP*4), BP - MOVB BP, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeSnappyBlockAsm +repeat_four_match_repeat_encodeBetterBlockAsm: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BP), BP - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm +repeat_three_match_repeat_encodeBetterBlockAsm: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: +repeat_two_match_repeat_encodeBetterBlockAsm: + 
SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_match_repeat_encodeBetterBlockAsm: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_as_copy_encodeBetterBlockAsm: + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 
2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm: + TESTL BP, BP + JZ repeat_end_emit_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW 
$0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -4305,200 +4607,255 @@ two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, 
AX -repeat_end_emit_encodeSnappyBlockAsm: +repeat_end_emit_encodeBetterBlockAsm: MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm + JMP search_loop_encodeBetterBlockAsm -no_repeat_found_encodeSnappyBlockAsm: +no_repeat_found_encodeBetterBlockAsm: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 + JEQ candidate_match_encodeBetterBlockAsm CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm + JEQ candidateS_match_encodeBetterBlockAsm MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm - -candidate3_match_encodeSnappyBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm + JMP search_loop_encodeBetterBlockAsm -candidate2_match_encodeSnappyBlockAsm: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BP +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL DI, BP -candidate_match_encodeSnappyBlockAsm: +candidate_match_encodeBetterBlockAsm: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeSnappyBlockAsm + JZ match_extend_back_end_encodeBetterBlockAsm -match_extend_back_loop_encodeSnappyBlockAsm: +match_extend_back_loop_encodeBetterBlockAsm: CMPL CX, SI - JLE match_extend_back_end_encodeSnappyBlockAsm + JLE match_extend_back_end_encodeBetterBlockAsm MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeSnappyBlockAsm + JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeSnappyBlockAsm - JMP match_extend_back_loop_encodeSnappyBlockAsm + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm 
-match_extend_back_end_encodeSnappyBlockAsm: +match_extend_back_end_encodeBetterBlockAsm: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm + JL match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeSnappyBlockAsm: +match_dst_size_check_encodeBetterBlockAsm: MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x00010000 - JLT three_bytes_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x01000000 - JLT four_bytes_match_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 -four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL DI, R9 - SHRL $0x10, R9 - MOVB $0xf8, (AX) - MOVW DI, 1(AX) + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_match_nolit_encodeBetterBlockAsm: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_loop_match_nolit_encodeBetterBlockAsm: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + +matchlen_single_match_nolit_encodeBetterBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + DECL DI + 
JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + +match_nolit_end_encodeBetterBlockAsm: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm + CMPL BP, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm + CMPL BP, $0x01000000 + JLT four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + MOVL BP, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm -three_bytes_match_emit_encodeSnappyBlockAsm: +three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm -two_bytes_match_emit_encodeSnappyBlockAsm: +two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB BP, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm - JMP memmove_long_match_emit_encodeSnappyBlockAsm + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm -one_byte_match_emit_encodeSnappyBlockAsm: - SHLB $0x02, DI - MOVB DI, (AX) +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, BP + MOVB BP, (AX) ADDQ $0x01, AX 
-memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm 
-emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm -memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI - MOVOU 
(SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 SHRQ $0x07, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 + MOVQ $0x00000040, R12 + SUBQ R9, R12 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -4507,277 +4864,383 @@ emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU 80(R9), X9 MOVOU 96(R9), X10 MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x80, R12 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 
+emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX -emit_literal_done_match_emit_encodeSnappyBlockAsm: -match_nolit_loop_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP - XORL R9, R9 - CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_loop_match_nolit_encodeSnappyBlockAsm: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm - -matchlen_single_match_nolit_encodeSnappyBlockAsm: - TESTL SI, SI - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(R9), R9 - DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm - -match_nolit_end_encodeSnappyBlockAsm: - ADDL R9, CX +emit_literal_done_match_emit_encodeBetterBlockAsm: + ADDL R11, CX MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL $0x04, R11 MOVL CX, 12(SP) + + // emitCopy CMPL BP, $0x00010000 - JL two_byte_offset_match_nolit_encodeSnappyBlockAsm + JL two_byte_offset_match_nolit_encodeBetterBlockAsm -four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x40 - JLE 
four_bytes_remain_match_nolit_encodeSnappyBlockAsm +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(R9), R9 + LEAL -64(R11), R11 ADDQ $0x05, AX - CMPL R9, $0x04 - JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm + CMPL R11, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm -four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL R9, R9 - JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy + LEAL -16842747(R11), R11 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + 
MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm MOVB $0x03, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm -two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(R9), R9 + LEAL -60(R11), R11 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + CMPL R11, 
$0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(R11), R11 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 + LEAL -16(BX)(R11*4), R11 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + ORL BP, R11 + MOVB R11, (AX) ADDQ $0x02, AX - JMP 
match_nolit_emitcopy_end_encodeSnappyBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm -emit_copy_three_match_nolit_encodeSnappyBlockAsm: +emit_copy_three_match_nolit_encodeBetterBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeSnappyBlockAsm: +match_nolit_emitcopy_end_encodeBetterBlockAsm: CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm - MOVQ -2(DX)(CX*1), SI + JGE emit_remainder_encodeBetterBlockAsm CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm + JL match_nolit_dst_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BP - IMULQ R8, BP - SHRQ $0x32, BP +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 262168(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm - INCL CX - JMP search_loop_encodeSnappyBlockAsm + LEAL -1(CX), SI + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 262168(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm -emit_remainder_encodeSnappyBlockAsm: +emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm + JL emit_remainder_ok_encodeBetterBlockAsm MOVQ $0x00000000, 
ret+48(FP) RET -emit_remainder_ok_encodeSnappyBlockAsm: +emit_remainder_ok_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm + JLT one_byte_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm + JLT two_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeSnappyBlockAsm + JLT three_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeSnappyBlockAsm + JLT four_bytes_emit_remainder_encodeBetterBlockAsm MOVB $0xfc, (AX) MOVL DX, 1(AX) ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm -four_bytes_emit_remainder_encodeSnappyBlockAsm: +four_bytes_emit_remainder_encodeBetterBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm -three_bytes_emit_remainder_encodeSnappyBlockAsm: +three_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm -two_bytes_emit_remainder_encodeSnappyBlockAsm: +two_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm + JL memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm -one_byte_emit_remainder_encodeSnappyBlockAsm: 
+one_byte_emit_remainder_encodeBetterBlockAsm: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeSnappyBlockAsm: +memmove_emit_remainder_encodeBetterBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: 
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -4787,13 +5250,15 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm -memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBetterBlockAsm: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -4805,11 +5270,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA 
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -4830,37 +5295,37 @@ emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeSnappyBlockAsm: +emit_literal_done_emit_remainder_encodeBetterBlockAsm: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int +// func encodeBetterBlockAsm12B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 +TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX + MOVQ $0x00000280, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeSnappyBlockAsm12B: +zero_loop_encodeBetterBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -4871,7 +5336,7 @@ zero_loop_encodeSnappyBlockAsm12B: MOVOU X0, 112(DX) ADDQ 
$0x80, DX DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm12B + JNZ zero_loop_encodeBetterBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -4885,266 +5350,358 @@ zero_loop_encodeSnappyBlockAsm12B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeSnappyBlockAsm12B: +search_loop_encodeBetterBlockAsm12B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x05, BP - LEAL 4(CX)(BP*1), BP + SHRL $0x06, BP + LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm12B + JGE emit_remainder_encodeBetterBlockAsm12B MOVL BP, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP MOVQ SI, R9 MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x18, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x18, R10 - IMULQ R8, R10 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI + MOVL 65560(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) + MOVL CX, 65560(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm12B + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm12B LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm12B + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm12B -repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL SI, BP - JLE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(DI*1), BL +repeat_extend_back_loop_encodeBetterBlockAsm12B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeSnappyBlockAsm12B + 
JNE repeat_extend_back_end_encodeBetterBlockAsm12B LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm12B -repeat_extend_back_end_encodeSnappyBlockAsm12B: +repeat_extend_back_end_encodeBetterBlockAsm12B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + MOVL SI, R8 MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B + JLT one_byte_repeat_emit_encodeBetterBlockAsm12B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B + JLT two_bytes_repeat_emit_encodeBetterBlockAsm12B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B -two_bytes_repeat_emit_encodeSnappyBlockAsm12B: +two_bytes_repeat_emit_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm12B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B + JL memmove_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B -one_byte_repeat_emit_encodeSnappyBlockAsm12B: +one_byte_repeat_emit_encodeBetterBlockAsm12B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BP - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 +memmove_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: +memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm12B -memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BP - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x07, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 +memmove_long_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x07, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOU 32(R10), X6 + MOVOU 48(R10), X7 + MOVOU 64(R10), X8 + MOVOU 80(R10), X9 + MOVOU 96(R10), X10 + MOVOU 112(R10), X11 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 + ADDQ $0x80, R10 ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 
+emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: +emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm12B -matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B +matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm12B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm12B -matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: +matchlen_loop_repeat_extend_encodeBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B + +matchlen_single_repeat_extend_encodeBetterBlockAsm12B: + 
TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm12B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B + +repeat_extend_forward_end_encodeBetterBlockAsm12B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI TESTL DI, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + JZ repeat_as_copy_encodeBetterBlockAsm12B -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm12B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm12B -repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm12B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: +repeat_three_match_repeat_encodeBetterBlockAsm12B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_match_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + 
SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_as_copy_encodeBetterBlockAsm12B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_three_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B: CMPL BP, $0x0c - JGE 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -5153,418 +5710,488 @@ two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm12B + JMP repeat_end_emit_encodeBetterBlockAsm12B -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeSnappyBlockAsm12B: +repeat_end_emit_encodeBetterBlockAsm12B: MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm12B + JMP search_loop_encodeBetterBlockAsm12B -no_repeat_found_encodeSnappyBlockAsm12B: +no_repeat_found_encodeBetterBlockAsm12B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 + JEQ candidate_match_encodeBetterBlockAsm12B CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm12B + JEQ candidateS_match_encodeBetterBlockAsm12B MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm12B + JMP search_loop_encodeBetterBlockAsm12B -candidate3_match_encodeSnappyBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm12B +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL DI, BP -candidate2_match_encodeSnappyBlockAsm12B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BP - 
-candidate_match_encodeSnappyBlockAsm12B: +candidate_match_encodeBetterBlockAsm12B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeSnappyBlockAsm12B + JZ match_extend_back_end_encodeBetterBlockAsm12B -match_extend_back_loop_encodeSnappyBlockAsm12B: +match_extend_back_loop_encodeBetterBlockAsm12B: CMPL CX, SI - JLE match_extend_back_end_encodeSnappyBlockAsm12B + JLE match_extend_back_end_encodeBetterBlockAsm12B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeSnappyBlockAsm12B + JNE match_extend_back_end_encodeBetterBlockAsm12B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeSnappyBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBlockAsm12B + JZ match_extend_back_end_encodeBetterBlockAsm12B + JMP match_extend_back_loop_encodeBetterBlockAsm12B -match_extend_back_end_encodeSnappyBlockAsm12B: +match_extend_back_end_encodeBetterBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm12B + JL match_dst_size_check_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeSnappyBlockAsm12B: +match_dst_size_check_encodeBetterBlockAsm12B: MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL SI, R8 + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_loop_match_nolit_encodeBetterBlockAsm12B: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B 
+ +matchlen_single_match_nolit_encodeBetterBlockAsm12B: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm12B + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B + +match_nolit_end_encodeBetterBlockAsm12B: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm12B + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm12B + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm12B + +match_length_ok_encodeBetterBlockAsm12B: + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + MOVL SI, DI MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm12B - CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm12B + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm12B + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm12B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B -two_bytes_match_emit_encodeSnappyBlockAsm12B: +two_bytes_match_emit_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB BP, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeBetterBlockAsm12B -one_byte_match_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, DI - MOVB DI, (AX) +one_byte_match_emit_encodeBetterBlockAsm12B: + SHLB $0x02, BP + MOVB BP, (AX) ADDQ $0x01, AX 
-memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 +memmove_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(DI*1), BP -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + MOVB R8, 2(AX) + JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm12B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B - -memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x07, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), 
R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 +emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm12B: -match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP - XORL R9, R9 - CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) -matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm12B +memmove_end_copy_match_emit_encodeBetterBlockAsm12B: + MOVQ BP, AX + JMP 
emit_literal_done_match_emit_encodeBetterBlockAsm12B -matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B +memmove_long_match_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(DI*1), BP -matchlen_single_match_nolit_encodeSnappyBlockAsm12B: - TESTL SI, SI - JZ match_nolit_end_encodeSnappyBlockAsm12B + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x07, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(R9), R9 - DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOU 32(R9), X6 + MOVOU 48(R9), X7 + MOVOU 64(R9), X8 + MOVOU 80(R9), X9 + MOVOU 96(R9), X10 + MOVOU 112(R9), X11 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 + ADDQ $0x80, R9 + ADDQ $0x80, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back -match_nolit_end_encodeSnappyBlockAsm12B: - ADDL R9, CX +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, 
-16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm12B: + ADDL R11, CX MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL $0x04, R11 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm12B: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(R9), R9 + LEAL -60(R11), R11 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + // emitRepeat + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + 
MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 + LEAL -16(BX)(R11*4), R11 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + ORL BP, R11 + MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B -emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: +emit_copy_three_match_nolit_encodeBetterBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: +match_nolit_emitcopy_end_encodeBetterBlockAsm12B: CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm12B - MOVQ -2(DX)(CX*1), SI + JGE emit_remainder_encodeBetterBlockAsm12B CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm12B + JL match_nolit_dst_ok_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x18, DI - IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BP - IMULQ R8, BP - SHRQ $0x34, BP +match_nolit_dst_ok_encodeBetterBlockAsm12B: + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 65560(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL 
CX, (R9) - CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm12B - INCL CX - JMP search_loop_encodeSnappyBlockAsm12B + LEAL -1(CX), SI + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x32, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x34, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 65560(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm12B -emit_remainder_encodeSnappyBlockAsm12B: +emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm12B + JL emit_remainder_ok_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeSnappyBlockAsm12B: +emit_remainder_ok_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B + JLT one_byte_emit_remainder_encodeBetterBlockAsm12B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B + JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B -two_bytes_emit_remainder_encodeSnappyBlockAsm12B: +two_bytes_emit_remainder_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B + JL memmove_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B -one_byte_emit_remainder_encodeSnappyBlockAsm12B: +one_byte_emit_remainder_encodeBetterBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeSnappyBlockAsm12B: 
+memmove_emit_remainder_encodeBetterBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: 
MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -5574,13 +6201,15 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B -memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBetterBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -5592,11 +6221,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm12B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA 
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -5617,37 +6246,37 @@ emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: +emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int +// func encodeBetterBlockAsm10B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 +TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX + MOVQ $0x000000a0, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeSnappyBlockAsm10B: +zero_loop_encodeBetterBlockAsm10B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -5658,7 +6287,7 @@ 
zero_loop_encodeSnappyBlockAsm10B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm10B + JNZ zero_loop_encodeBetterBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -5672,266 +6301,358 @@ zero_loop_encodeSnappyBlockAsm10B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeSnappyBlockAsm10B: +search_loop_encodeBetterBlockAsm10B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP - LEAL 4(CX)(BP*1), BP + LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm10B + JGE emit_remainder_encodeBetterBlockAsm10B MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP MOVQ SI, R9 MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x36, R9 + SHRQ $0x34, R9 SHLQ $0x20, R10 - IMULQ R8, R10 + IMULQ BP, R10 SHRQ $0x36, R10 MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI + MOVL 16408(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm10B + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm10B LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm10B + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm10B -repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL SI, BP - JLE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(DI*1), BL +repeat_extend_back_loop_encodeBetterBlockAsm10B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE 
repeat_extend_back_end_encodeSnappyBlockAsm10B + JNE repeat_extend_back_end_encodeBetterBlockAsm10B LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm10B -repeat_extend_back_end_encodeSnappyBlockAsm10B: +repeat_extend_back_end_encodeBetterBlockAsm10B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + MOVL SI, R8 MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B + JLT one_byte_repeat_emit_encodeBetterBlockAsm10B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B + JLT two_bytes_repeat_emit_encodeBetterBlockAsm10B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B -two_bytes_repeat_emit_encodeSnappyBlockAsm10B: +two_bytes_repeat_emit_encodeBetterBlockAsm10B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm10B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B + JL memmove_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B -one_byte_repeat_emit_encodeSnappyBlockAsm10B: +one_byte_repeat_emit_encodeBetterBlockAsm10B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BP - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 +memmove_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: +memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm10B -memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BP - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x07, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 +memmove_long_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x07, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOU 32(R10), X6 + MOVOU 48(R10), X7 + MOVOU 64(R10), X8 + MOVOU 80(R10), X9 + MOVOU 96(R10), X10 + MOVOU 112(R10), X11 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 + ADDQ $0x80, R10 ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 
+emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: +emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 LEAQ (DX)(BP*1), BP - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm10B -matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B +matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm10B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm10B -matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: +matchlen_loop_repeat_extend_encodeBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B + +matchlen_single_repeat_extend_encodeBetterBlockAsm10B: + 
TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm10B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B + +repeat_extend_forward_end_encodeBetterBlockAsm10B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI TESTL DI, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + JZ repeat_as_copy_encodeBetterBlockAsm10B -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm10B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm10B -repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm10B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: +repeat_three_match_repeat_encodeBetterBlockAsm10B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_match_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + 
SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_as_copy_encodeBetterBlockAsm10B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_three_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B: CMPL BP, $0x0c - JGE 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -5940,181 +6661,236 @@ two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm10B + JMP repeat_end_emit_encodeBetterBlockAsm10B -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeSnappyBlockAsm10B: +repeat_end_emit_encodeBetterBlockAsm10B: MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm10B + JMP search_loop_encodeBetterBlockAsm10B -no_repeat_found_encodeSnappyBlockAsm10B: +no_repeat_found_encodeBetterBlockAsm10B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 + JEQ candidate_match_encodeBetterBlockAsm10B CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm10B + JEQ candidateS_match_encodeBetterBlockAsm10B MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm10B - -candidate3_match_encodeSnappyBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm10B + JMP search_loop_encodeBetterBlockAsm10B -candidate2_match_encodeSnappyBlockAsm10B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BP +candidateS_match_encodeBetterBlockAsm10B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm10B + DECL CX + MOVL DI, BP 
-candidate_match_encodeSnappyBlockAsm10B: +candidate_match_encodeBetterBlockAsm10B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeSnappyBlockAsm10B + JZ match_extend_back_end_encodeBetterBlockAsm10B + +match_extend_back_loop_encodeBetterBlockAsm10B: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm10B + JMP match_extend_back_loop_encodeBetterBlockAsm10B + +match_extend_back_end_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBetterBlockAsm10B: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_loop_match_nolit_encodeBetterBlockAsm10B: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B + +matchlen_single_match_nolit_encodeBetterBlockAsm10B: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm10B -match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL CX, SI - JLE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(BP*1), BL - MOVB -1(DX)(CX*1), DI - CMPB BL, DI - JNE match_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(CX), CX - DECL BP - JZ match_extend_back_end_encodeSnappyBlockAsm10B - JMP 
match_extend_back_loop_encodeSnappyBlockAsm10B +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm10B + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B -match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI - CMPQ SI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET +match_nolit_end_encodeBetterBlockAsm10B: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm10B + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm10B + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm10B -match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL SI, R8 +match_length_ok_encodeBetterBlockAsm10B: + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + MOVL SI, DI MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm10B - CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm10B + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm10B + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm10B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B -two_bytes_match_emit_encodeSnappyBlockAsm10B: +two_bytes_match_emit_encodeBetterBlockAsm10B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB BP, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B + CMPL BP, $0x40 + JL 
memmove_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeBetterBlockAsm10B -one_byte_match_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, DI - MOVB DI, (AX) +one_byte_match_emit_encodeBetterBlockAsm10B: + SHLB $0x02, BP + MOVB BP, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 +memmove_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(DI*1), BP -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (SI), R9 
- MOVB 2(SI), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 +emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, 
-16(AX)(DI*1) -memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B +memmove_end_copy_match_emit_encodeBetterBlockAsm10B: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B -memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 +memmove_long_match_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 SHRQ $0x07, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 + MOVQ $0x00000040, R12 + SUBQ R9, R12 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -6122,236 +6898,251 @@ emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU 64(R9), X8 MOVOU 80(R9), X9 MOVOU 96(R9), X10 - MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm10B: -match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP - XORL R9, R9 - CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBlockAsm10B: - TESTL SI, SI - JZ match_nolit_end_encodeSnappyBlockAsm10B + MOVOU 112(R9), X11 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 + ADDQ $0x80, R9 + ADDQ $0x80, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(R9), R9 - DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + 
MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX -match_nolit_end_encodeSnappyBlockAsm10B: - ADDL R9, CX +emit_literal_done_match_emit_encodeBetterBlockAsm10B: + ADDL R11, CX MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL $0x04, R11 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm10B: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(R9), R9 + LEAL -60(R11), R11 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + // emitRepeat + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + 
+repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 + LEAL -16(BX)(R11*4), R11 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + ORL BP, R11 + MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B -emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: +emit_copy_three_match_nolit_encodeBetterBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: +match_nolit_emitcopy_end_encodeBetterBlockAsm10B: CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm10B - MOVQ -2(DX)(CX*1), SI + JGE emit_remainder_encodeBetterBlockAsm10B CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm10B + JL match_nolit_dst_ok_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x36, BP +match_nolit_dst_ok_encodeBetterBlockAsm10B: + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x36, R10 + MOVL SI, 
24(SP)(R9*4) + MOVL R11, 16408(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm10B - INCL CX - JMP search_loop_encodeSnappyBlockAsm10B + LEAL -1(CX), SI + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x34, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x36, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 16408(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm10B -emit_remainder_encodeSnappyBlockAsm10B: +emit_remainder_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm10B + JL emit_remainder_ok_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeSnappyBlockAsm10B: +emit_remainder_ok_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B + JLT one_byte_emit_remainder_encodeBetterBlockAsm10B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B + JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B -two_bytes_emit_remainder_encodeSnappyBlockAsm10B: +two_bytes_emit_remainder_encodeBetterBlockAsm10B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B + JL memmove_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B -one_byte_emit_remainder_encodeSnappyBlockAsm10B: 
+one_byte_emit_remainder_encodeBetterBlockAsm10B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeSnappyBlockAsm10B: +memmove_emit_remainder_encodeBetterBlockAsm10B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B 
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -6361,13 +7152,15 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B -memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBetterBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -6379,11 +7172,11 @@ 
memmove_long_emit_remainder_encodeSnappyBlockAsm10B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -6404,37 +7197,37 @@ emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: +emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int +// func encodeBetterBlockAsm8B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 +TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX + MOVQ $0x00000028, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeSnappyBlockAsm8B: +zero_loop_encodeBetterBlockAsm8B: 
MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -6445,7 +7238,7 @@ zero_loop_encodeSnappyBlockAsm8B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm8B + JNZ zero_loop_encodeBetterBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -6459,264 +7252,348 @@ zero_loop_encodeSnappyBlockAsm8B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeSnappyBlockAsm8B: +search_loop_encodeBetterBlockAsm8B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x04, BP - LEAL 4(CX)(BP*1), BP + LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm8B + JGE emit_remainder_encodeBetterBlockAsm8B MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ $0x9e3779b1, BP MOVQ SI, R9 MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x38, R9 + SHRQ $0x36, R9 SHLQ $0x20, R10 - IMULQ R8, R10 + IMULQ BP, R10 SHRQ $0x38, R10 MOVL 24(SP)(R9*4), BP - MOVL 24(SP)(R10*4), DI + MOVL 4120(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) + MOVL CX, 4120(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm8B + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm8B LEAL 1(CX), SI - MOVL 12(SP), BP - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm8B + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm8B -repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL SI, BP - JLE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(DI*1), BL +repeat_extend_back_loop_encodeBetterBlockAsm8B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 
CMPB BL, R8 - JNE repeat_extend_back_end_encodeSnappyBlockAsm8B + JNE repeat_extend_back_end_encodeBetterBlockAsm8B LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm8B -repeat_extend_back_end_encodeSnappyBlockAsm8B: +repeat_extend_back_end_encodeBetterBlockAsm8B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + MOVL SI, R8 MOVL SI, 12(SP) - LEAQ (DX)(BP*1), R8 - SUBL BP, DI - LEAL -1(DI), BP + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B + JLT one_byte_repeat_emit_encodeBetterBlockAsm8B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B + JLT two_bytes_repeat_emit_encodeBetterBlockAsm8B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B -two_bytes_repeat_emit_encodeSnappyBlockAsm8B: +two_bytes_repeat_emit_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeSnappyBlockAsm8B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B + JL memmove_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B -one_byte_repeat_emit_encodeSnappyBlockAsm8B: +one_byte_repeat_emit_encodeBetterBlockAsm8B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BP - CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3 - CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 +memmove_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (R8), R9 - MOVB -1(R8)(DI*1), R8 - MOVB R9, (AX) - MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (R8), R9 - MOVB 2(R8), R8 - MOVW R9, (AX) - MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: +memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm8B -memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BP - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x07, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ 
-32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 +memmove_long_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), BP -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x07, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOU 32(R10), X6 + MOVOU 48(R10), X7 + MOVOU 64(R10), X8 + MOVOU 80(R10), X9 + MOVOU 96(R10), X10 + MOVOU 112(R10), X11 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 + ADDQ $0x80, R10 ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU 
-16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: - ADDL $0x05, CX - MOVL CX, BP - SUBL 16(SP), BP - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BP*1), BP - XORL R10, R10 - CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B +emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (BP)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B +matchlen_single_repeat_extend_encodeBetterBlockAsm8B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm8B -matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B 
+matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm8B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B -matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: +repeat_extend_forward_end_encodeBetterBlockAsm8B: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI TESTL DI, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + JZ repeat_as_copy_encodeBetterBlockAsm8B -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B -repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL R10, CX - MOVL CX, BP - SUBL SI, BP - MOVL 16(SP), SI +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm8B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: +repeat_three_match_repeat_encodeBetterBlockAsm8B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_two_match_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_as_copy_encodeBetterBlockAsm8B: + // emitCopy 
+two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_three_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_two_repeat_as_copy_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -6725,181 +7602,236 @@ two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm8B + JMP repeat_end_emit_encodeBetterBlockAsm8B 
-emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeSnappyBlockAsm8B: +repeat_end_emit_encodeBetterBlockAsm8B: MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm8B + JMP search_loop_encodeBetterBlockAsm8B -no_repeat_found_encodeSnappyBlockAsm8B: +no_repeat_found_encodeBetterBlockAsm8B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BP - LEAL 2(CX), R8 + JEQ candidate_match_encodeBetterBlockAsm8B CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm8B + JEQ candidateS_match_encodeBetterBlockAsm8B MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm8B - -candidate3_match_encodeSnappyBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm8B + JMP search_loop_encodeBetterBlockAsm8B -candidate2_match_encodeSnappyBlockAsm8B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BP +candidateS_match_encodeBetterBlockAsm8B: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm8B + DECL CX + MOVL DI, BP -candidate_match_encodeSnappyBlockAsm8B: +candidate_match_encodeBetterBlockAsm8B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeSnappyBlockAsm8B + JZ match_extend_back_end_encodeBetterBlockAsm8B -match_extend_back_loop_encodeSnappyBlockAsm8B: +match_extend_back_loop_encodeBetterBlockAsm8B: CMPL CX, SI - JLE match_extend_back_end_encodeSnappyBlockAsm8B + JLE match_extend_back_end_encodeBetterBlockAsm8B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeSnappyBlockAsm8B + JNE match_extend_back_end_encodeBetterBlockAsm8B LEAL 
-1(CX), CX DECL BP - JZ match_extend_back_end_encodeSnappyBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBlockAsm8B + JZ match_extend_back_end_encodeBetterBlockAsm8B + JMP match_extend_back_loop_encodeBetterBlockAsm8B -match_extend_back_end_encodeSnappyBlockAsm8B: +match_extend_back_end_encodeBetterBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeSnappyBlockAsm8B + JL match_dst_size_check_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeSnappyBlockAsm8B: +match_dst_size_check_encodeBetterBlockAsm8B: MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL SI, R8 + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + +matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_loop_match_nolit_encodeBetterBlockAsm8B: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B + +matchlen_single_match_nolit_encodeBetterBlockAsm8B: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm8B + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B + +match_nolit_end_encodeBetterBlockAsm8B: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm8B + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm8B + MOVL 20(SP), CX + INCL CX + JMP 
search_loop_encodeBetterBlockAsm8B + +match_length_ok_encodeBetterBlockAsm8B: + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + MOVL SI, DI MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JLT one_byte_match_emit_encodeSnappyBlockAsm8B - CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeSnappyBlockAsm8B + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm8B + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm8B MOVB $0xf4, (AX) - MOVW DI, 1(AX) + MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B -two_bytes_match_emit_encodeSnappyBlockAsm8B: +two_bytes_match_emit_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) - MOVB DI, 1(AX) + MOVB BP, 1(AX) ADDQ $0x02, AX - CMPL DI, $0x40 - JL memmove_match_emit_encodeSnappyBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeBetterBlockAsm8B -one_byte_match_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, DI - MOVB DI, (AX) +one_byte_match_emit_encodeBetterBlockAsm8B: + SHLB $0x02, BP + MOVB BP, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI - CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 +memmove_match_emit_encodeBetterBlockAsm8B: + LEAQ 
(AX)(DI*1), BP -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (SI), R9 - MOVB -1(SI)(R8*1), SI + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) - MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (SI), R9 - MOVB 2(SI), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 MOVW R9, (AX) - MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (SI), R9 - MOVL -4(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) - MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) - MOVQ 
SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + MOVQ R8, -8(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 +emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B +memmove_end_copy_match_emit_encodeBetterBlockAsm8B: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B -memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 +memmove_long_match_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 SHRQ $0x07, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 + MOVQ $0x00000040, R12 + SUBQ R9, R12 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 + JA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -6908,233 +7840,244 @@ emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU 80(R9), X9 MOVOU 96(R9), X10 MOVOU 112(R9), X11 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + MOVOA X6, 32(R13) + MOVOA X7, 48(R13) + MOVOA X8, 64(R13) + MOVOA X9, 80(R13) + MOVOA X10, 96(R13) + MOVOA X11, 112(R13) + ADDQ $0x80, R13 ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x80, R12 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm8B: -match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL CX, SI - SUBL BP, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BP - MOVQ src_len+32(FP), SI - 
SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), BP - XORL R9, R9 - CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B - -matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVQ (DI)(R9*1), R8 - XORQ (BP)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R8, R8 - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBlockAsm8B: - TESTL SI, SI - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVB (DI)(R9*1), R8 - CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(R9), R9 - DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX -match_nolit_end_encodeSnappyBlockAsm8B: - ADDL R9, CX +emit_literal_done_match_emit_encodeBetterBlockAsm8B: + ADDL R11, CX MOVL 16(SP), BP - ADDL $0x04, R9 + ADDL $0x04, R11 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B + // emitCopy +two_byte_offset_match_nolit_encodeBetterBlockAsm8B: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(R9), R9 + LEAL -60(R11), R11 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B + // emitRepeat + MOVL R11, BP + LEAL -4(R11), R11 + CMPL BP, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + CMPL BP, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + 
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(R9*4), R9 + LEAL -16(BX)(R11*4), R11 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, R9 - MOVB R9, (AX) + ORL BP, R11 + MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B -emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: +emit_copy_three_match_nolit_encodeBetterBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(R9*4), R9 - MOVB R9, (AX) + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: +match_nolit_emitcopy_end_encodeBetterBlockAsm8B: CMPL CX, 8(SP) - JGE emit_remainder_encodeSnappyBlockAsm8B - MOVQ -2(DX)(CX*1), SI + JGE emit_remainder_encodeBetterBlockAsm8B CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeSnappyBlockAsm8B + JL match_nolit_dst_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET 
-match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BP - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x38, DI - SHLQ $0x20, BP - IMULQ R8, BP - SHRQ $0x38, BP +match_nolit_dst_ok_encodeBetterBlockAsm8B: + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x38, R10 + MOVL SI, 24(SP)(R9*4) + MOVL R11, 4120(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm8B - INCL CX - JMP search_loop_encodeSnappyBlockAsm8B + LEAL -1(CX), SI + SHLQ $0x10, R9 + IMULQ BP, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x38, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 4120(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm8B -emit_remainder_encodeSnappyBlockAsm8B: +emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeSnappyBlockAsm8B + JL emit_remainder_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeSnappyBlockAsm8B: +emit_remainder_ok_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B + JLT one_byte_emit_remainder_encodeBetterBlockAsm8B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B + JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP 
memmove_long_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B -two_bytes_emit_remainder_encodeSnappyBlockAsm8B: +two_bytes_emit_remainder_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeSnappyBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B + JL memmove_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B -one_byte_emit_remainder_encodeSnappyBlockAsm8B: +one_byte_emit_remainder_encodeBetterBlockAsm8B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeSnappyBlockAsm8B: +memmove_emit_remainder_encodeBetterBlockAsm8B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: MOVB (CX), BP MOVB 
-1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -7144,13 +8087,15 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) 
-memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B -memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeBetterBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -7162,11 +8107,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm8B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -7187,37 +8132,37 @@ emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX 
-emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: +emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBetterBlockAsm(dst []byte, src []byte) int +// func encodeSnappyBlockAsm(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 +TEXT ·encodeSnappyBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00000200, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBetterBlockAsm: +zero_loop_encodeSnappyBlockAsm: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -7228,7 +8173,7 @@ zero_loop_encodeBetterBlockAsm: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBetterBlockAsm + JNZ zero_loop_encodeSnappyBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -7242,152 +8187,160 @@ zero_loop_encodeBetterBlockAsm: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBetterBlockAsm: +search_loop_encodeSnappyBlockAsm: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x07, BP - LEAL 1(CX)(BP*1), BP + SHRL $0x06, BP + LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm + JGE emit_remainder_encodeSnappyBlockAsm MOVL BP, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R8 - MOVQ $0x9e3779b1, BP + MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 MOVQ SI, R10 - SHLQ $0x08, R9 + SHRQ $0x08, R10 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BP - MOVL 262168(SP)(R10*4), DI + MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - MOVL CX, 262168(SP)(R10*4) - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R10 + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 - SHRQ $0x08, R9 - CMPL R9, R10 - JNE no_repeat_found_encodeBetterBlockAsm + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + 
SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm LEAL 1(CX), SI MOVL 12(SP), BP MOVL SI, DI SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeBetterBlockAsm + JZ repeat_extend_back_end_encodeSnappyBlockAsm -repeat_extend_back_loop_encodeBetterBlockAsm: +repeat_extend_back_loop_encodeSnappyBlockAsm: CMPL SI, BP - JLE repeat_extend_back_end_encodeBetterBlockAsm + JLE repeat_extend_back_end_encodeSnappyBlockAsm MOVB -1(DX)(DI*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBetterBlockAsm + JNE repeat_extend_back_end_encodeSnappyBlockAsm LEAL -1(SI), SI DECL DI - JNZ repeat_extend_back_loop_encodeBetterBlockAsm + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm -repeat_extend_back_end_encodeBetterBlockAsm: +repeat_extend_back_end_encodeSnappyBlockAsm: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BP*1), R8 SUBL BP, DI LEAL -1(DI), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm + JLT one_byte_repeat_emit_encodeSnappyBlockAsm CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm CMPL BP, $0x00010000 - JLT three_bytes_repeat_emit_encodeBetterBlockAsm + JLT three_bytes_repeat_emit_encodeSnappyBlockAsm CMPL BP, $0x01000000 - JLT four_bytes_repeat_emit_encodeBetterBlockAsm + JLT four_bytes_repeat_emit_encodeSnappyBlockAsm MOVB $0xfc, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm -four_bytes_repeat_emit_encodeBetterBlockAsm: +four_bytes_repeat_emit_encodeSnappyBlockAsm: MOVL BP, R9 SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm -three_bytes_repeat_emit_encodeBetterBlockAsm: 
+three_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm -two_bytes_repeat_emit_encodeBetterBlockAsm: +two_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JL memmove_repeat_emit_encodeSnappyBlockAsm + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm -one_byte_repeat_emit_encodeBetterBlockAsm: +one_byte_repeat_emit_encodeSnappyBlockAsm: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBetterBlockAsm: +memmove_repeat_emit_encodeSnappyBlockAsm: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3 CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2: MOVB (R8), R9 MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) MOVB R8, 
-1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3: MOVW (R8), R9 MOVB 2(R8), R8 MOVW R9, (AX) MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7: MOVL (R8), R9 MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -7397,12 +8350,14 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_repeat_emit_encodeBetterBlockAsm: +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + JMP 
emit_literal_done_repeat_emit_encodeSnappyBlockAsm -memmove_long_repeat_emit_encodeBetterBlockAsm: - LEAQ (AX)(DI*1), BP +memmove_long_repeat_emit_encodeSnappyBlockAsm: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -7414,11 +8369,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -7439,23 +8394,23 @@ emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBetterBlockAsm: +emit_literal_done_repeat_emit_encodeSnappyBlockAsm: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -7463,81 +8418,85 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 - JL 
matchlen_single_repeat_extend_encodeBetterBlockAsm + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm -matchlen_loopback_repeat_extend_encodeBetterBlockAsm: +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: MOVQ (R8)(R10*1), R9 XORQ (BP)(R10*1), R9 TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm BSFQ R9, R9 SARQ $0x03, R9 LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeBetterBlockAsm + JMP repeat_extend_forward_end_encodeSnappyBlockAsm -matchlen_loop_repeat_extend_encodeBetterBlockAsm: +matchlen_loop_repeat_extend_encodeSnappyBlockAsm: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm -matchlen_single_repeat_extend_encodeBetterBlockAsm: +matchlen_single_repeat_extend_encodeSnappyBlockAsm: TESTL DI, DI - JZ repeat_extend_forward_end_encodeBetterBlockAsm + JZ repeat_extend_forward_end_encodeSnappyBlockAsm -matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm: +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: MOVB (R8)(R10*1), R9 CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeBetterBlockAsm + JNE repeat_extend_forward_end_encodeSnappyBlockAsm LEAL 1(R10), R10 DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm -repeat_extend_forward_end_encodeBetterBlockAsm: +repeat_extend_forward_end_encodeSnappyBlockAsm: ADDL R10, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI + + // emitCopy CMPL SI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm -four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm: +four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: CMPL BP, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + JLE 
four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xff, (AX) MOVL SI, 1(AX) LEAL -64(BP), BP ADDQ $0x05, AX CMPL BP, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm + JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm -four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm: +four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: TESTL BP, BP - JZ repeat_end_emit_encodeBetterBlockAsm + JZ repeat_end_emit_encodeSnappyBlockAsm MOVB $0x03, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVL SI, 1(AX) ADDQ $0x05, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeSnappyBlockAsm -two_byte_offset_repeat_as_copy_encodeBetterBlockAsm: +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm -two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -7546,167 +8505,171 @@ two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeSnappyBlockAsm -emit_copy_three_repeat_as_copy_encodeBetterBlockAsm: +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW 
SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBetterBlockAsm: +repeat_end_emit_encodeSnappyBlockAsm: MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm + JMP search_loop_encodeSnappyBlockAsm -no_repeat_found_encodeBetterBlockAsm: +no_repeat_found_encodeSnappyBlockAsm: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm + JEQ candidate_match_encodeSnappyBlockAsm + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidateS_match_encodeBetterBlockAsm + JEQ candidate2_match_encodeSnappyBlockAsm + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm + JMP search_loop_encodeSnappyBlockAsm -candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x30, R9 - MOVL 24(SP)(R9*4), BP - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm - DECL CX - MOVL DI, BP +candidate3_match_encodeSnappyBlockAsm: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm -candidate_match_encodeBetterBlockAsm: +candidate2_match_encodeSnappyBlockAsm: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BP + +candidate_match_encodeSnappyBlockAsm: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBetterBlockAsm + JZ match_extend_back_end_encodeSnappyBlockAsm -match_extend_back_loop_encodeBetterBlockAsm: +match_extend_back_loop_encodeSnappyBlockAsm: CMPL CX, SI - JLE match_extend_back_end_encodeBetterBlockAsm + JLE match_extend_back_end_encodeSnappyBlockAsm MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBetterBlockAsm + JNE match_extend_back_end_encodeSnappyBlockAsm LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBetterBlockAsm - JMP match_extend_back_loop_encodeBetterBlockAsm + JZ match_extend_back_end_encodeSnappyBlockAsm + JMP match_extend_back_loop_encodeSnappyBlockAsm 
-match_extend_back_end_encodeBetterBlockAsm: +match_extend_back_end_encodeSnappyBlockAsm: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm + JL match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBetterBlockAsm: +match_dst_size_check_encodeSnappyBlockAsm: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm + JLT one_byte_match_emit_encodeSnappyBlockAsm CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm + JLT two_bytes_match_emit_encodeSnappyBlockAsm CMPL DI, $0x00010000 - JLT three_bytes_match_emit_encodeBetterBlockAsm + JLT three_bytes_match_emit_encodeSnappyBlockAsm CMPL DI, $0x01000000 - JLT four_bytes_match_emit_encodeBetterBlockAsm + JLT four_bytes_match_emit_encodeSnappyBlockAsm MOVB $0xfc, (AX) MOVL DI, 1(AX) ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm -four_bytes_match_emit_encodeBetterBlockAsm: +four_bytes_match_emit_encodeSnappyBlockAsm: MOVL DI, R9 SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm -three_bytes_match_emit_encodeBetterBlockAsm: +three_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm -two_bytes_match_emit_encodeBetterBlockAsm: +two_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm - JMP memmove_long_match_emit_encodeBetterBlockAsm + JL 
memmove_match_emit_encodeSnappyBlockAsm + JMP memmove_long_match_emit_encodeSnappyBlockAsm -one_byte_match_emit_encodeBetterBlockAsm: +one_byte_match_emit_encodeSnappyBlockAsm: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBetterBlockAsm: +memmove_match_emit_encodeSnappyBlockAsm: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm 
-emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -7716,12 +8679,14 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBetterBlockAsm: +memmove_end_copy_match_emit_encodeSnappyBlockAsm: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm -memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeSnappyBlockAsm: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -7733,11 +8698,11 @@ memmove_long_match_emit_encodeBetterBlockAsm: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + 
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -7758,23 +8723,24 @@ emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBetterBlockAsm: +emit_literal_done_match_emit_encodeSnappyBlockAsm: +match_nolit_loop_encodeSnappyBlockAsm: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -7783,250 +8749,245 @@ emit_literal_done_match_emit_encodeBetterBlockAsm: MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), R8 - XORL R10, R10 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm + JL matchlen_single_match_nolit_encodeSnappyBlockAsm -matchlen_loopback_match_nolit_encodeBetterBlockAsm: - MOVQ (DI)(R10*1), R9 - XORQ (R8)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBetterBlockAsm 
+matchlen_loopback_match_nolit_encodeSnappyBlockAsm: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm -matchlen_loop_match_nolit_encodeBetterBlockAsm: +matchlen_loop_match_nolit_encodeSnappyBlockAsm: LEAL -8(SI), SI - LEAL 8(R10), R10 + LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm -matchlen_single_match_nolit_encodeBetterBlockAsm: +matchlen_single_match_nolit_encodeSnappyBlockAsm: TESTL SI, SI - JZ match_nolit_end_encodeBetterBlockAsm + JZ match_nolit_end_encodeSnappyBlockAsm -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: - MOVB (DI)(R10*1), R9 - CMPB (R8)(R10*1), R9 - JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R10), R10 +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm + LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm -match_nolit_end_encodeBetterBlockAsm: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 +match_nolit_end_encodeSnappyBlockAsm: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 MOVL CX, 12(SP) - CMPL SI, $0x00010000 - JL two_byte_offset_match_nolit_encodeBetterBlockAsm -four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: - CMPL R10, $0x40 - JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + // emitCopy + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeSnappyBlockAsm + +four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: + CMPL R9, $0x40 + JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(R10), R10 + MOVL BP, 1(AX) + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL R10, $0x04 - JL 
four_bytes_remain_match_nolit_encodeBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + CMPL R9, $0x04 + JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm -four_bytes_remain_match_nolit_encodeBetterBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm +four_bytes_remain_match_nolit_encodeSnappyBlockAsm: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm MOVB $0x03, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVL SI, 1(AX) + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVL BP, 1(AX) ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm -two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm +two_byte_offset_match_nolit_encodeSnappyBlockAsm: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 + MOVW BP, 1(AX) + LEAL -60(R9), R9 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm -two_byte_offset_short_match_nolit_encodeBetterBlockAsm: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm 
-emit_copy_three_match_nolit_encodeBetterBlockAsm: +emit_copy_three_match_nolit_encodeSnappyBlockAsm: MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBetterBlockAsm: +match_nolit_emitcopy_end_encodeSnappyBlockAsm: CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm - MOVQ -3(DX)(BP*1), SI + JGE emit_remainder_encodeSnappyBlockAsm + MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm + JL match_nolit_dst_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, DI - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -3(BP), R11 - LEAL -2(BP), BP - MOVQ -2(DX)(CX*1), SI - SHLQ $0x08, R9 - IMULQ DI, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x32, R10 - MOVL R11, 24(SP)(R9*4) - MOVL BP, 262168(SP)(R10*4) - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -2(CX), SI - LEAL -1(CX), BP - SHLQ $0x08, R9 - IMULQ DI, R9 - SHRQ $0x30, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x32, R10 - MOVL SI, 24(SP)(R9*4) - MOVL BP, 262168(SP)(R10*4) - JMP search_loop_encodeBetterBlockAsm +match_nolit_dst_ok_encodeSnappyBlockAsm: + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x10, DI + IMULQ R8, DI + SHRQ $0x32, DI + SHLQ $0x10, BP + IMULQ R8, BP + SHRQ $0x32, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm + INCL CX + JMP search_loop_encodeSnappyBlockAsm -emit_remainder_encodeBetterBlockAsm: +emit_remainder_encodeSnappyBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm + JL emit_remainder_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET 
-emit_remainder_ok_encodeBetterBlockAsm: +emit_remainder_ok_encodeSnappyBlockAsm: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm + JLT one_byte_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBetterBlockAsm + JLT three_bytes_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeBetterBlockAsm + JLT four_bytes_emit_remainder_encodeSnappyBlockAsm MOVB $0xfc, (AX) MOVL DX, 1(AX) ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm -four_bytes_emit_remainder_encodeBetterBlockAsm: +four_bytes_emit_remainder_encodeSnappyBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm -three_bytes_emit_remainder_encodeBetterBlockAsm: +three_bytes_emit_remainder_encodeSnappyBlockAsm: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm -two_bytes_emit_remainder_encodeBetterBlockAsm: +two_bytes_emit_remainder_encodeSnappyBlockAsm: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JL memmove_emit_remainder_encodeSnappyBlockAsm + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm -one_byte_emit_remainder_encodeBetterBlockAsm: +one_byte_emit_remainder_encodeSnappyBlockAsm: 
SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBetterBlockAsm: +memmove_emit_remainder_encodeSnappyBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -8036,13 +8997,15 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBetterBlockAsm: +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm -memmove_long_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeSnappyBlockAsm: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -8054,11 +9017,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -8079,37 +9042,37 @@ emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBetterBlockAsm: +emit_literal_done_emit_remainder_encodeSnappyBlockAsm: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBetterBlockAsm12B(dst []byte, src []byte) int +// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 +TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000280, CX + MOVQ $0x00000080, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBetterBlockAsm12B: +zero_loop_encodeSnappyBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -8120,7 +9083,7 @@ zero_loop_encodeBetterBlockAsm12B: MOVOU X0, 112(DX) ADDQ 
$0x80, DX DECQ CX - JNZ zero_loop_encodeBetterBlockAsm12B + JNZ zero_loop_encodeSnappyBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -8134,133 +9097,141 @@ zero_loop_encodeBetterBlockAsm12B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBetterBlockAsm12B: +search_loop_encodeSnappyBlockAsm12B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x06, BP - LEAL 1(CX)(BP*1), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm12B + JGE emit_remainder_encodeSnappyBlockAsm12B MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BP + MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 MOVQ SI, R10 - SHLQ $0x10, R9 + SHRQ $0x08, R10 + SHLQ $0x18, R9 IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ BP, R10 + SHRQ $0x34, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BP - MOVL 65560(SP)(R10*4), DI + MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - MOVL CX, 65560(SP)(R10*4) - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R10 + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 - SHRQ $0x08, R9 - CMPL R9, R10 - JNE no_repeat_found_encodeBetterBlockAsm12B + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm12B LEAL 1(CX), SI MOVL 12(SP), BP MOVL SI, DI SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeBetterBlockAsm12B + JZ repeat_extend_back_end_encodeSnappyBlockAsm12B -repeat_extend_back_loop_encodeBetterBlockAsm12B: +repeat_extend_back_loop_encodeSnappyBlockAsm12B: CMPL SI, BP - JLE repeat_extend_back_end_encodeBetterBlockAsm12B + JLE repeat_extend_back_end_encodeSnappyBlockAsm12B MOVB -1(DX)(DI*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBetterBlockAsm12B + JNE repeat_extend_back_end_encodeSnappyBlockAsm12B LEAL -1(SI), SI DECL DI - JNZ 
repeat_extend_back_loop_encodeBetterBlockAsm12B + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B -repeat_extend_back_end_encodeBetterBlockAsm12B: +repeat_extend_back_end_encodeSnappyBlockAsm12B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BP*1), R8 SUBL BP, DI LEAL -1(DI), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm12B + JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm12B + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B -two_bytes_repeat_emit_encodeBetterBlockAsm12B: +two_bytes_repeat_emit_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm12B - JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + JL memmove_repeat_emit_encodeSnappyBlockAsm12B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B -one_byte_repeat_emit_encodeBetterBlockAsm12B: +one_byte_repeat_emit_encodeSnappyBlockAsm12B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBetterBlockAsm12B: +memmove_repeat_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ DI, $0x10 - JBE 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: MOVB (R8), R9 MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3: MOVW (R8), R9 MOVB 2(R8), R8 MOVW R9, (AX) MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7: MOVL (R8), R9 MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32: 
+emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -8270,12 +9241,14 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B: +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B -memmove_long_repeat_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(DI*1), BP +memmove_long_repeat_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -8287,11 +9260,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm12B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -8312,23 +9285,23 @@ emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back 
-emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: +emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -8336,58 +9309,61 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeBetterBlockAsm12B + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B -matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B: +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: MOVQ (R8)(R10*1), R9 XORQ (BP)(R10*1), R9 TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm12B + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B BSFQ R9, R9 SARQ $0x03, R9 LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeBetterBlockAsm12B + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B -matchlen_loop_repeat_extend_encodeBetterBlockAsm12B: +matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B -matchlen_single_repeat_extend_encodeBetterBlockAsm12B: +matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: TESTL DI, DI - JZ repeat_extend_forward_end_encodeBetterBlockAsm12B + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B 
-matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B: +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: MOVB (R8)(R10*1), R9 CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeBetterBlockAsm12B + JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B LEAL 1(R10), R10 DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm12B + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B -repeat_extend_forward_end_encodeBetterBlockAsm12B: +repeat_extend_forward_end_encodeSnappyBlockAsm12B: ADDL R10, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI -two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B -two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B: +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -8396,148 +9372,152 @@ two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm12B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm12B + JMP repeat_end_emit_encodeSnappyBlockAsm12B -emit_copy_three_repeat_as_copy_encodeBetterBlockAsm12B: +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBetterBlockAsm12B: 
+repeat_end_emit_encodeSnappyBlockAsm12B: MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm12B + JMP search_loop_encodeSnappyBlockAsm12B -no_repeat_found_encodeBetterBlockAsm12B: +no_repeat_found_encodeSnappyBlockAsm12B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm12B + JEQ candidate_match_encodeSnappyBlockAsm12B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidateS_match_encodeBetterBlockAsm12B + JEQ candidate2_match_encodeSnappyBlockAsm12B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm12B MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B + JMP search_loop_encodeSnappyBlockAsm12B -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL 24(SP)(R9*4), BP - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL CX - MOVL DI, BP +candidate3_match_encodeSnappyBlockAsm12B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm12B -candidate_match_encodeBetterBlockAsm12B: +candidate2_match_encodeSnappyBlockAsm12B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BP + +candidate_match_encodeSnappyBlockAsm12B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBetterBlockAsm12B + JZ match_extend_back_end_encodeSnappyBlockAsm12B -match_extend_back_loop_encodeBetterBlockAsm12B: +match_extend_back_loop_encodeSnappyBlockAsm12B: CMPL CX, SI - JLE match_extend_back_end_encodeBetterBlockAsm12B + JLE match_extend_back_end_encodeSnappyBlockAsm12B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBetterBlockAsm12B + JNE match_extend_back_end_encodeSnappyBlockAsm12B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBetterBlockAsm12B - JMP match_extend_back_loop_encodeBetterBlockAsm12B + JZ match_extend_back_end_encodeSnappyBlockAsm12B + JMP 
match_extend_back_loop_encodeSnappyBlockAsm12B -match_extend_back_end_encodeBetterBlockAsm12B: +match_extend_back_end_encodeSnappyBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm12B + JL match_dst_size_check_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBetterBlockAsm12B: +match_dst_size_check_encodeSnappyBlockAsm12B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm12B + JLT one_byte_match_emit_encodeSnappyBlockAsm12B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm12B + JLT two_bytes_match_emit_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B -two_bytes_match_emit_encodeBetterBlockAsm12B: +two_bytes_match_emit_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_encodeBetterBlockAsm12B + JL memmove_match_emit_encodeSnappyBlockAsm12B + JMP memmove_long_match_emit_encodeSnappyBlockAsm12B -one_byte_match_emit_encodeBetterBlockAsm12B: +one_byte_match_emit_encodeSnappyBlockAsm12B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBetterBlockAsm12B: +memmove_match_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3 
CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + JMP 
memmove_end_copy_match_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -8547,12 +9527,14 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBetterBlockAsm12B: +memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B -memmove_long_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeSnappyBlockAsm12B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -8564,11 +9546,11 @@ memmove_long_match_emit_encodeBetterBlockAsm12B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -8589,23 +9571,24 @@ emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back + 
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBetterBlockAsm12B: +emit_literal_done_match_emit_encodeSnappyBlockAsm12B: +match_nolit_loop_encodeSnappyBlockAsm12B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -8614,208 +9597,202 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), R8 - XORL R10, R10 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B -matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVQ (DI)(R10*1), R9 - XORQ (R8)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBetterBlockAsm12B +matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm12B -matchlen_loop_match_nolit_encodeBetterBlockAsm12B: +matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: LEAL -8(SI), SI - LEAL 8(R10), R10 + LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B + JGE 
matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B -matchlen_single_match_nolit_encodeBetterBlockAsm12B: +matchlen_single_match_nolit_encodeSnappyBlockAsm12B: TESTL SI, SI - JZ match_nolit_end_encodeBetterBlockAsm12B + JZ match_nolit_end_encodeSnappyBlockAsm12B -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVB (DI)(R10*1), R9 - CMPB (R8)(R10*1), R9 - JNE match_nolit_end_encodeBetterBlockAsm12B - LEAL 1(R10), R10 +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm12B + LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B -match_nolit_end_encodeBetterBlockAsm12B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 +match_nolit_end_encodeSnappyBlockAsm12B: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeBetterBlockAsm12B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 + MOVW BP, 1(AX) + LEAL -60(R9), R9 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B -two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - 
MOVB R10, (AX) + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B -emit_copy_three_match_nolit_encodeBetterBlockAsm12B: +emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBetterBlockAsm12B: +match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm12B - MOVQ -3(DX)(BP*1), SI + JGE emit_remainder_encodeSnappyBlockAsm12B + MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm12B + JL match_nolit_dst_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, DI - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -3(BP), R11 - LEAL -2(BP), BP - MOVQ -2(DX)(CX*1), SI - SHLQ $0x10, R9 - IMULQ DI, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x34, R10 - MOVL R11, 24(SP)(R9*4) - MOVL BP, 65560(SP)(R10*4) - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -2(CX), SI - LEAL -1(CX), BP - SHLQ $0x10, R9 - IMULQ DI, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x34, R10 - MOVL SI, 24(SP)(R9*4) - MOVL BP, 65560(SP)(R10*4) - JMP search_loop_encodeBetterBlockAsm12B +match_nolit_dst_ok_encodeSnappyBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x18, DI + IMULQ R8, DI + SHRQ $0x34, DI + SHLQ $0x18, BP + IMULQ R8, BP + SHRQ $0x34, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm12B + INCL CX + JMP search_loop_encodeSnappyBlockAsm12B 
-emit_remainder_encodeBetterBlockAsm12B: +emit_remainder_encodeSnappyBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm12B + JL emit_remainder_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBetterBlockAsm12B: +emit_remainder_ok_encodeSnappyBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm12B + JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B -two_bytes_emit_remainder_encodeBetterBlockAsm12B: +two_bytes_emit_remainder_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B + JL memmove_emit_remainder_encodeSnappyBlockAsm12B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B -one_byte_emit_remainder_encodeBetterBlockAsm12B: +one_byte_emit_remainder_encodeSnappyBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBetterBlockAsm12B: +memmove_emit_remainder_encodeSnappyBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -8825,13 +9802,15 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B -memmove_long_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeSnappyBlockAsm12B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -8843,11 +9822,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm12B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: 
+emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -8868,37 +9847,37 @@ emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: +emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBetterBlockAsm10B(dst []byte, src []byte) int +// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 +TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX + MOVQ $0x00000020, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBetterBlockAsm10B: +zero_loop_encodeSnappyBlockAsm10B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -8909,7 +9888,7 @@ zero_loop_encodeBetterBlockAsm10B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBetterBlockAsm10B + JNZ zero_loop_encodeSnappyBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -8923,133 +9902,141 @@ zero_loop_encodeBetterBlockAsm10B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX 
-search_loop_encodeBetterBlockAsm10B: +search_loop_encodeSnappyBlockAsm10B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP - LEAL 1(CX)(BP*1), BP + LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B + JGE emit_remainder_encodeSnappyBlockAsm10B MOVL BP, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BP + MOVQ $0x9e3779b1, R8 MOVQ SI, R9 MOVQ SI, R10 - SHLQ $0x10, R9 + SHRQ $0x08, R10 + SHLQ $0x20, R9 IMULQ R8, R9 - SHRQ $0x34, R9 + SHRQ $0x36, R9 SHLQ $0x20, R10 - IMULQ BP, R10 + IMULQ R8, R10 SHRQ $0x36, R10 MOVL 24(SP)(R9*4), BP - MOVL 16408(SP)(R10*4), DI + MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - MOVL CX, 16408(SP)(R10*4) - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R10 + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 - SHRQ $0x08, R9 - CMPL R9, R10 - JNE no_repeat_found_encodeBetterBlockAsm10B + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm10B LEAL 1(CX), SI MOVL 12(SP), BP MOVL SI, DI SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeBetterBlockAsm10B + JZ repeat_extend_back_end_encodeSnappyBlockAsm10B -repeat_extend_back_loop_encodeBetterBlockAsm10B: +repeat_extend_back_loop_encodeSnappyBlockAsm10B: CMPL SI, BP - JLE repeat_extend_back_end_encodeBetterBlockAsm10B + JLE repeat_extend_back_end_encodeSnappyBlockAsm10B MOVB -1(DX)(DI*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBetterBlockAsm10B + JNE repeat_extend_back_end_encodeSnappyBlockAsm10B LEAL -1(SI), SI DECL DI - JNZ repeat_extend_back_loop_encodeBetterBlockAsm10B + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B -repeat_extend_back_end_encodeBetterBlockAsm10B: +repeat_extend_back_end_encodeSnappyBlockAsm10B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + JEQ 
emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BP*1), R8 SUBL BP, DI LEAL -1(DI), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm10B + JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm10B + JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B -two_bytes_repeat_emit_encodeBetterBlockAsm10B: +two_bytes_repeat_emit_encodeSnappyBlockAsm10B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm10B - JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + JL memmove_repeat_emit_encodeSnappyBlockAsm10B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B -one_byte_repeat_emit_encodeBetterBlockAsm10B: +one_byte_repeat_emit_encodeSnappyBlockAsm10B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBetterBlockAsm10B: +memmove_repeat_emit_encodeSnappyBlockAsm10B: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: MOVB (R8), R9 MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3: MOVW (R8), R9 MOVB 2(R8), R8 MOVW R9, (AX) MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7: MOVL (R8), R9 MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B 
-emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -9059,12 +10046,14 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B: +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B -memmove_long_repeat_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(DI*1), BP +memmove_long_repeat_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -9076,11 +10065,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm10B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -9101,23 +10090,23 @@ emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 - JAE 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: +emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -9125,58 +10114,61 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeBetterBlockAsm10B + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B -matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B: +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: MOVQ (R8)(R10*1), R9 XORQ (BP)(R10*1), R9 TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm10B + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B BSFQ R9, R9 SARQ $0x03, R9 LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeBetterBlockAsm10B + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B -matchlen_loop_repeat_extend_encodeBetterBlockAsm10B: +matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B -matchlen_single_repeat_extend_encodeBetterBlockAsm10B: +matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: TESTL DI, DI - JZ repeat_extend_forward_end_encodeBetterBlockAsm10B + JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B -matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B: +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: MOVB (R8)(R10*1), R9 CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeBetterBlockAsm10B + JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B LEAL 1(R10), R10 DECL DI - JNZ 
matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm10B + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B -repeat_extend_forward_end_encodeBetterBlockAsm10B: +repeat_extend_forward_end_encodeSnappyBlockAsm10B: ADDL R10, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI -two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B -two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B: +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -9185,148 +10177,152 @@ two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm10B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm10B + JMP repeat_end_emit_encodeSnappyBlockAsm10B -emit_copy_three_repeat_as_copy_encodeBetterBlockAsm10B: +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBetterBlockAsm10B: +repeat_end_emit_encodeSnappyBlockAsm10B: MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm10B + JMP search_loop_encodeSnappyBlockAsm10B -no_repeat_found_encodeBetterBlockAsm10B: +no_repeat_found_encodeSnappyBlockAsm10B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm10B + JEQ 
candidate_match_encodeSnappyBlockAsm10B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidateS_match_encodeBetterBlockAsm10B + JEQ candidate2_match_encodeSnappyBlockAsm10B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm10B MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B + JMP search_loop_encodeSnappyBlockAsm10B -candidateS_match_encodeBetterBlockAsm10B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL 24(SP)(R9*4), BP - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm10B - DECL CX - MOVL DI, BP +candidate3_match_encodeSnappyBlockAsm10B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm10B -candidate_match_encodeBetterBlockAsm10B: +candidate2_match_encodeSnappyBlockAsm10B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BP + +candidate_match_encodeSnappyBlockAsm10B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBetterBlockAsm10B + JZ match_extend_back_end_encodeSnappyBlockAsm10B -match_extend_back_loop_encodeBetterBlockAsm10B: +match_extend_back_loop_encodeSnappyBlockAsm10B: CMPL CX, SI - JLE match_extend_back_end_encodeBetterBlockAsm10B + JLE match_extend_back_end_encodeSnappyBlockAsm10B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBetterBlockAsm10B + JNE match_extend_back_end_encodeSnappyBlockAsm10B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBetterBlockAsm10B - JMP match_extend_back_loop_encodeBetterBlockAsm10B + JZ match_extend_back_end_encodeSnappyBlockAsm10B + JMP match_extend_back_loop_encodeSnappyBlockAsm10B -match_extend_back_end_encodeBetterBlockAsm10B: +match_extend_back_end_encodeSnappyBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm10B + JL match_dst_size_check_encodeSnappyBlockAsm10B MOVQ $0x00000000, 
ret+48(FP) RET -match_dst_size_check_encodeBetterBlockAsm10B: +match_dst_size_check_encodeSnappyBlockAsm10B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm10B + JLT one_byte_match_emit_encodeSnappyBlockAsm10B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm10B + JLT two_bytes_match_emit_encodeSnappyBlockAsm10B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B -two_bytes_match_emit_encodeBetterBlockAsm10B: +two_bytes_match_emit_encodeSnappyBlockAsm10B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_encodeBetterBlockAsm10B + JL memmove_match_emit_encodeSnappyBlockAsm10B + JMP memmove_long_match_emit_encodeSnappyBlockAsm10B -one_byte_match_emit_encodeBetterBlockAsm10B: +one_byte_match_emit_encodeSnappyBlockAsm10B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBetterBlockAsm10B: +memmove_match_emit_encodeSnappyBlockAsm10B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + JBE 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm10B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -9336,12 +10332,14 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBetterBlockAsm10B: +memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B -memmove_long_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeSnappyBlockAsm10B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -9353,11 +10351,11 @@ memmove_long_match_emit_encodeBetterBlockAsm10B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -9378,23 +10376,24 @@ emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU 
-16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBetterBlockAsm10B: +emit_literal_done_match_emit_encodeSnappyBlockAsm10B: +match_nolit_loop_encodeSnappyBlockAsm10B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -9403,208 +10402,202 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), R8 - XORL R10, R10 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B -matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVQ (DI)(R10*1), R9 - XORQ (R8)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBetterBlockAsm10B +matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm10B -matchlen_loop_match_nolit_encodeBetterBlockAsm10B: +matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: LEAL -8(SI), SI - LEAL 8(R10), R10 + LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B -matchlen_single_match_nolit_encodeBetterBlockAsm10B: +matchlen_single_match_nolit_encodeSnappyBlockAsm10B: TESTL SI, SI - JZ match_nolit_end_encodeBetterBlockAsm10B + JZ match_nolit_end_encodeSnappyBlockAsm10B 
-matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVB (DI)(R10*1), R9 - CMPB (R8)(R10*1), R9 - JNE match_nolit_end_encodeBetterBlockAsm10B - LEAL 1(R10), R10 +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm10B + LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B -match_nolit_end_encodeBetterBlockAsm10B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 +match_nolit_end_encodeSnappyBlockAsm10B: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeBetterBlockAsm10B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 + MOVW BP, 1(AX) + LEAL -60(R9), R9 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B -two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - CMPL SI, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B 
-emit_copy_three_match_nolit_encodeBetterBlockAsm10B: +emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBetterBlockAsm10B: +match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - MOVQ -3(DX)(BP*1), SI + JGE emit_remainder_encodeSnappyBlockAsm10B + MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm10B + JL match_nolit_dst_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, DI +match_nolit_dst_ok_encodeSnappyBlockAsm10B: MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -3(BP), R11 - LEAL -2(BP), BP - MOVQ -2(DX)(CX*1), SI - SHLQ $0x10, R9 - IMULQ DI, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x36, R10 - MOVL R11, 24(SP)(R9*4) - MOVL BP, 16408(SP)(R10*4) - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -2(CX), SI - LEAL -1(CX), BP - SHLQ $0x10, R9 - IMULQ DI, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x36, R10 - MOVL SI, 24(SP)(R9*4) - MOVL BP, 16408(SP)(R10*4) - JMP search_loop_encodeBetterBlockAsm10B + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x36, DI + SHLQ $0x20, BP + IMULQ R8, BP + SHRQ $0x36, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm10B + INCL CX + JMP search_loop_encodeSnappyBlockAsm10B -emit_remainder_encodeBetterBlockAsm10B: +emit_remainder_encodeSnappyBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm10B + JL emit_remainder_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET 
-emit_remainder_ok_encodeBetterBlockAsm10B: +emit_remainder_ok_encodeSnappyBlockAsm10B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm10B + JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B -two_bytes_emit_remainder_encodeBetterBlockAsm10B: +two_bytes_emit_remainder_encodeSnappyBlockAsm10B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B + JL memmove_emit_remainder_encodeSnappyBlockAsm10B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B -one_byte_emit_remainder_encodeBetterBlockAsm10B: +one_byte_emit_remainder_encodeSnappyBlockAsm10B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBetterBlockAsm10B: +memmove_emit_remainder_encodeSnappyBlockAsm10B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 - JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B 
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -9614,13 +10607,15 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B -memmove_long_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX +memmove_long_emit_remainder_encodeSnappyBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -9632,11 +10627,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm10B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -9657,37 +10652,37 @@ emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: +emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBetterBlockAsm8B(dst []byte, src []byte) int +// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 +TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX + MOVQ $0x00000008, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBetterBlockAsm8B: +zero_loop_encodeSnappyBlockAsm8B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -9698,7 +10693,7 @@ zero_loop_encodeBetterBlockAsm8B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBetterBlockAsm8B + JNZ zero_loop_encodeSnappyBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -9712,133 +10707,141 @@ zero_loop_encodeBetterBlockAsm8B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBetterBlockAsm8B: +search_loop_encodeSnappyBlockAsm8B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x04, BP - LEAL 1(CX)(BP*1), BP + LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm8B + JGE emit_remainder_encodeSnappyBlockAsm8B MOVL BP, 
20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BP + MOVQ $0x9e3779b1, R8 MOVQ SI, R9 MOVQ SI, R10 - SHLQ $0x10, R9 + SHRQ $0x08, R10 + SHLQ $0x20, R9 IMULQ R8, R9 - SHRQ $0x36, R9 + SHRQ $0x38, R9 SHLQ $0x20, R10 - IMULQ BP, R10 + IMULQ R8, R10 SHRQ $0x38, R10 MOVL 24(SP)(R9*4), BP - MOVL 4120(SP)(R10*4), DI + MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) - MOVL CX, 4120(SP)(R10*4) - MOVL CX, R9 - SUBL 16(SP), R9 - MOVL 1(DX)(R9*1), R10 + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 - SHRQ $0x08, R9 - CMPL R9, R10 - JNE no_repeat_found_encodeBetterBlockAsm8B + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeSnappyBlockAsm8B LEAL 1(CX), SI MOVL 12(SP), BP MOVL SI, DI SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeBetterBlockAsm8B + JZ repeat_extend_back_end_encodeSnappyBlockAsm8B -repeat_extend_back_loop_encodeBetterBlockAsm8B: +repeat_extend_back_loop_encodeSnappyBlockAsm8B: CMPL SI, BP - JLE repeat_extend_back_end_encodeBetterBlockAsm8B + JLE repeat_extend_back_end_encodeSnappyBlockAsm8B MOVB -1(DX)(DI*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBetterBlockAsm8B + JNE repeat_extend_back_end_encodeSnappyBlockAsm8B LEAL -1(SI), SI DECL DI - JNZ repeat_extend_back_loop_encodeBetterBlockAsm8B + JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B -repeat_extend_back_end_encodeBetterBlockAsm8B: +repeat_extend_back_end_encodeSnappyBlockAsm8B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BP*1), R8 SUBL BP, DI LEAL -1(DI), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm8B + JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm8B + JLT 
two_bytes_repeat_emit_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B -two_bytes_repeat_emit_encodeBetterBlockAsm8B: +two_bytes_repeat_emit_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm8B - JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + JL memmove_repeat_emit_encodeSnappyBlockAsm8B + JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B -one_byte_repeat_emit_encodeBetterBlockAsm8B: +one_byte_repeat_emit_encodeSnappyBlockAsm8B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBetterBlockAsm8B: +memmove_repeat_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(DI*1), BP + + // genMemMoveShort CMPQ DI, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ DI, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: MOVB (R8), R9 
MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3: MOVW (R8), R9 MOVB 2(R8), R8 MOVW R9, (AX) MOVB R8, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7: MOVL (R8), R9 MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -9848,12 +10851,14 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B: +memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: 
MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B -memmove_long_repeat_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(DI*1), BP +memmove_long_repeat_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -9865,11 +10870,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm8B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -9890,23 +10895,23 @@ emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: +emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -9914,56 +10919,59 @@ 
emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BP*1), BP + + // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_repeat_extend_encodeBetterBlockAsm8B + JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B -matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B: +matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: MOVQ (R8)(R10*1), R9 XORQ (BP)(R10*1), R9 TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm8B + JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B BSFQ R9, R9 SARQ $0x03, R9 LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeBetterBlockAsm8B + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B -matchlen_loop_repeat_extend_encodeBetterBlockAsm8B: +matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B + JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B -matchlen_single_repeat_extend_encodeBetterBlockAsm8B: +matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: TESTL DI, DI - JZ repeat_extend_forward_end_encodeBetterBlockAsm8B + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B -matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B: +matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: MOVB (R8)(R10*1), R9 CMPB (BP)(R10*1), R9 - JNE repeat_extend_forward_end_encodeBetterBlockAsm8B + JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B LEAL 1(R10), R10 DECL DI - JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm8B + JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B -repeat_extend_forward_end_encodeBetterBlockAsm8B: +repeat_extend_forward_end_encodeSnappyBlockAsm8B: ADDL R10, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI -two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: CMPL BP, $0x40 - JLE 
two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B + JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B -two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B: +two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B + JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -9972,148 +10980,152 @@ two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm8B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm8B + JMP repeat_end_emit_encodeSnappyBlockAsm8B -emit_copy_three_repeat_as_copy_encodeBetterBlockAsm8B: +emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBetterBlockAsm8B: +repeat_end_emit_encodeSnappyBlockAsm8B: MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm8B + JMP search_loop_encodeSnappyBlockAsm8B -no_repeat_found_encodeBetterBlockAsm8B: +no_repeat_found_encodeSnappyBlockAsm8B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm8B + JEQ candidate_match_encodeSnappyBlockAsm8B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidateS_match_encodeBetterBlockAsm8B + JEQ candidate2_match_encodeSnappyBlockAsm8B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI + JEQ candidate3_match_encodeSnappyBlockAsm8B MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B + JMP search_loop_encodeSnappyBlockAsm8B -candidateS_match_encodeBetterBlockAsm8B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL 24(SP)(R9*4), BP - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BP*1), SI - JEQ 
candidate_match_encodeBetterBlockAsm8B - DECL CX - MOVL DI, BP +candidate3_match_encodeSnappyBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeSnappyBlockAsm8B -candidate_match_encodeBetterBlockAsm8B: +candidate2_match_encodeSnappyBlockAsm8B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BP + +candidate_match_encodeSnappyBlockAsm8B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBetterBlockAsm8B + JZ match_extend_back_end_encodeSnappyBlockAsm8B -match_extend_back_loop_encodeBetterBlockAsm8B: +match_extend_back_loop_encodeSnappyBlockAsm8B: CMPL CX, SI - JLE match_extend_back_end_encodeBetterBlockAsm8B + JLE match_extend_back_end_encodeSnappyBlockAsm8B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBetterBlockAsm8B + JNE match_extend_back_end_encodeSnappyBlockAsm8B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBetterBlockAsm8B - JMP match_extend_back_loop_encodeBetterBlockAsm8B + JZ match_extend_back_end_encodeSnappyBlockAsm8B + JMP match_extend_back_loop_encodeSnappyBlockAsm8B -match_extend_back_end_encodeBetterBlockAsm8B: +match_extend_back_end_encodeSnappyBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm8B + JL match_dst_size_check_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBetterBlockAsm8B: +match_dst_size_check_encodeSnappyBlockAsm8B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B + JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm8B + JLT one_byte_match_emit_encodeSnappyBlockAsm8B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm8B + JLT two_bytes_match_emit_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP 
memmove_long_match_emit_encodeBetterBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B -two_bytes_match_emit_encodeBetterBlockAsm8B: +two_bytes_match_emit_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_encodeBetterBlockAsm8B + JL memmove_match_emit_encodeSnappyBlockAsm8B + JMP memmove_long_match_emit_encodeSnappyBlockAsm8B -one_byte_match_emit_encodeBetterBlockAsm8B: +one_byte_match_emit_encodeSnappyBlockAsm8B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBetterBlockAsm8B: +memmove_match_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(R8*1), DI + + // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP 
memmove_end_copy_match_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -10123,12 +11135,14 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBetterBlockAsm8B: +memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B + JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B 
-memmove_long_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), DI +memmove_long_match_emit_encodeSnappyBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -10140,11 +11154,11 @@ memmove_long_match_emit_encodeBetterBlockAsm8B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOU 32(R9), X6 @@ -10165,23 +11179,24 @@ emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: ADDQ $0x80, R9 ADDQ $0x80, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBetterBlockAsm8B: +emit_literal_done_match_emit_encodeSnappyBlockAsm8B: +match_nolit_loop_encodeSnappyBlockAsm8B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -10190,206 +11205,200 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI - LEAQ (DX)(BP*1), R8 - XORL R10, R10 + LEAQ 
(DX)(BP*1), BP + + // matchLen + XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B -matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVQ (DI)(R10*1), R9 - XORQ (R8)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBetterBlockAsm8B +matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm8B -matchlen_loop_match_nolit_encodeBetterBlockAsm8B: +matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: LEAL -8(SI), SI - LEAL 8(R10), R10 + LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B + JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B -matchlen_single_match_nolit_encodeBetterBlockAsm8B: +matchlen_single_match_nolit_encodeSnappyBlockAsm8B: TESTL SI, SI - JZ match_nolit_end_encodeBetterBlockAsm8B + JZ match_nolit_end_encodeSnappyBlockAsm8B -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVB (DI)(R10*1), R9 - CMPB (R8)(R10*1), R9 - JNE match_nolit_end_encodeBetterBlockAsm8B - LEAL 1(R10), R10 +matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeSnappyBlockAsm8B + LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B + JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B -match_nolit_end_encodeBetterBlockAsm8B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 +match_nolit_end_encodeSnappyBlockAsm8B: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 MOVL CX, 12(SP) -two_byte_offset_match_nolit_encodeBetterBlockAsm8B: - CMPL R10, $0x40 - JLE 
two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + // emitCopy +two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(R10), R10 + MOVW BP, 1(AX) + LEAL -60(R9), R9 ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B + JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B -two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: - CMPL R10, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B +two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B MOVB $0x01, BL - LEAL -16(BX)(R10*4), R10 - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (AX) + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B -emit_copy_three_match_nolit_encodeBetterBlockAsm8B: +emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: MOVB $0x02, BL - LEAL -4(BX)(R10*4), R10 - MOVB R10, (AX) - MOVW SI, 1(AX) + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBetterBlockAsm8B: +match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm8B - MOVQ -3(DX)(BP*1), SI + JGE emit_remainder_encodeSnappyBlockAsm8B + MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm8B + JL match_nolit_dst_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, DI +match_nolit_dst_ok_encodeSnappyBlockAsm8B: MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -3(BP), R11 - LEAL -2(BP), BP - MOVQ -2(DX)(CX*1), SI - SHLQ $0x10, R9 - IMULQ DI, R9 - SHRQ $0x36, R9 - SHLQ $0x20, 
R10 - IMULQ R8, R10 - SHRQ $0x38, R10 - MOVL R11, 24(SP)(R9*4) - MOVL BP, 4120(SP)(R10*4) - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - LEAL -2(CX), SI - LEAL -1(CX), BP - SHLQ $0x10, R9 - IMULQ DI, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x38, R10 - MOVL SI, 24(SP)(R9*4) - MOVL BP, 4120(SP)(R10*4) - JMP search_loop_encodeBetterBlockAsm8B + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x38, DI + SHLQ $0x20, BP + IMULQ R8, BP + SHRQ $0x38, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeSnappyBlockAsm8B + INCL CX + JMP search_loop_encodeSnappyBlockAsm8B -emit_remainder_encodeBetterBlockAsm8B: +emit_remainder_encodeSnappyBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm8B + JL emit_remainder_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBetterBlockAsm8B: +emit_remainder_ok_encodeSnappyBlockAsm8B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm8B + JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B + JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B -two_bytes_emit_remainder_encodeBetterBlockAsm8B: +two_bytes_emit_remainder_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBetterBlockAsm8B - JMP 
memmove_long_emit_remainder_encodeBetterBlockAsm8B + JL memmove_emit_remainder_encodeSnappyBlockAsm8B + JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B -one_byte_emit_remainder_encodeBetterBlockAsm8B: +one_byte_emit_remainder_encodeSnappyBlockAsm8B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBetterBlockAsm8B: +memmove_emit_remainder_encodeSnappyBlockAsm8B: LEAQ (AX)(BP*1), DX MOVL BP, BX + + // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: MOVW (CX), BP 
MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -10399,13 +11408,15 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: +memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B + JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B -memmove_long_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(BP*1), DX - MOVL BP, BX 
+memmove_long_emit_remainder_encodeSnappyBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -10417,11 +11428,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm8B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOU 32(BP), X6 @@ -10442,23 +11453,23 @@ emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: ADDQ $0x80, BP ADDQ $0x80, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: +emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) @@ -10521,6 +11532,7 @@ one_byte_standalone: ADDQ $0x01, AX memmove_standalone: + // genMemMoveShort CMPQ DX, $0x03 JB emit_lit_memmove_standalone_memmove_move_1or2 JE emit_lit_memmove_standalone_memmove_move_3 @@ 
-10580,6 +11592,7 @@ emit_lit_memmove_standalone_memmove_move_33through64: JMP emit_literal_end_standalone memmove_long_standalone: + // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(DX*1), X2 @@ -10647,6 +11660,7 @@ TEXT ·emitRepeat(SB), NOSPLIT, $0-48 MOVQ offset+24(FP), CX MOVQ length+32(FP), DX + // emitRepeat emit_repeat_again_standalone: MOVL DX, BP LEAL -4(DX), DX @@ -10728,6 +11742,8 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48 MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX MOVQ length+32(FP), DX + + // emitCopy CMPL CX, $0x00010000 JL two_byte_offset_standalone @@ -10742,6 +11758,7 @@ four_bytes_loop_back_standalone: CMPL DX, $0x04 JL four_bytes_remain_standalone + // emitRepeat emit_repeat_again_standalone_emit_copy: MOVL DX, BP LEAL -4(DX), DX @@ -10835,6 +11852,7 @@ two_byte_offset_standalone: ADDQ $0x03, AX ADDQ $0x03, BX + // emitRepeat emit_repeat_again_standalone_emit_copy_short: MOVL DX, BP LEAL -4(DX), DX @@ -10942,6 +11960,8 @@ TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 MOVQ dst_base+0(FP), AX MOVQ offset+24(FP), CX MOVQ length+32(FP), DX + + // emitCopy CMPL CX, $0x00010000 JL two_byte_offset_standalone_snappy @@ -11011,6 +12031,8 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX MOVQ b_base+24(FP), CX MOVQ a_len+8(FP), DX + + // matchLen XORL BP, BP CMPL DX, $0x08 JL matchlen_single_standalone From 0f5512cc2687d69d45c3408523d0424f6b928331 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 19 Feb 2021 11:57:07 +0100 Subject: [PATCH 03/10] Make Go code closer to assembly, skipping less on inefficient encodes. 
--- s2/encode_better.go | 5 +++-- s2/s2_test.go | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/s2/encode_better.go b/s2/encode_better.go index 636859aa1d..24914a0693 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -83,9 +83,10 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { for { candidateL := 0 + nextS := 0 for { // Next src position to check - nextS := s + (s-nextEmit)>>7 + 1 + nextS = s + (s-nextEmit)>>7 + 1 if nextS > sLimit { goto emitRemainder } @@ -185,7 +186,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { if offset > 65535 && s-base <= 5 { // Bail if the match is equal or worse to the encoding. - s = base + 3 + s = nextS + 1 if s >= sLimit { goto emitRemainder } diff --git a/s2/s2_test.go b/s2/s2_test.go index 2fc5b1da36..132050d488 100644 --- a/s2/s2_test.go +++ b/s2/s2_test.go @@ -1330,12 +1330,26 @@ func benchDecode(b *testing.B, src []byte) { func benchEncode(b *testing.B, src []byte) { // Bandwidth is in amount of uncompressed data. 
- b.SetBytes(int64(len(src))) dst := make([]byte, MaxEncodedLen(len(src))) b.ResetTimer() - for i := 0; i < b.N; i++ { - Encode(dst, src) - } + b.Run("default", func(b *testing.B) { + b.SetBytes(int64(len(src))) + for i := 0; i < b.N; i++ { + Encode(dst, src) + } + }) + b.Run("better", func(b *testing.B) { + b.SetBytes(int64(len(src))) + for i := 0; i < b.N; i++ { + EncodeBetter(dst, src) + } + }) + b.Run("best", func(b *testing.B) { + b.SetBytes(int64(len(src))) + for i := 0; i < b.N; i++ { + EncodeBest(dst, src) + } + }) } func benchEncodeBetter(b *testing.B, src []byte) { From 26ec8d9fa4a9310d50128b00b6312e21de48d685 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 19 Feb 2021 12:24:22 +0100 Subject: [PATCH 04/10] Update benchies --- s2/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/s2/README.md b/s2/README.md index 479e824778..601bd397a4 100644 --- a/s2/README.md +++ b/s2/README.md @@ -8,7 +8,7 @@ Decoding is compatible with Snappy compressed content, but content compressed wi This means that S2 can seamlessly replace Snappy without converting compressed content. S2 is designed to have high throughput on content that cannot be compressed. -This is important so you don't have to worry about spending CPU cycles on already compressed data. +This is important, so you don't have to worry about spending CPU cycles on already compressed data. ## Benefits over Snappy @@ -456,33 +456,33 @@ This will compress as much as possible with little regard to CPU usage. Mainly for offline compression, but where decompression speed should still be high and compatible with other S2 compressed data. -Some examples compared on 16 core CPU: +Some examples compared on 16 core CPU, amd64 assembly used: ``` * enwik10 Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s -Better... 10000000000 -> 4225922984 [42.26%]; 2.817s, 3385.4MB/s -Best... 10000000000 -> 3667646858 [36.68%]; 35.995s, 264.9MB/s +Better... 
10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s +Best... 10000000000 -> 3667646858 [36.68%]; 35.995s, 264.9MB/s * github-june-2days-2019.json Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s -Better... 6273951764 -> 950079555 [15.14%]; 736ms, 8129.5MB/s -Best... 6273951764 -> 846260870 [13.49%]; 8.125s, 736.4MB/s +Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s +Best... 6273951764 -> 846260870 [13.49%]; 8.125s, 736.4MB/s * nyc-taxi-data-10M.csv Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s -Better... 3325605752 -> 960330423 [28.88%]; 602ms, 5268.4MB/s -Best... 3325605752 -> 794873295 [23.90%]; 6.619s, 479.1MB/s +Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s +Best... 3325605752 -> 794873295 [23.90%]; 6.619s, 479.1MB/s * 10gb.tar Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s -Better... 10065157632 -> 5650133605 [56.14%]; 2.172s, 4419.4MB/s -Best... 10065157632 -> 5246578570 [52.13%]; 25.696s, 373.6MB/s +Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s +Best... 10065157632 -> 5246578570 [52.13%]; 25.696s, 373.6MB/s * consensus.db.10gb Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s -Better... 10737418240 -> 4542443833 [42.30%]; 3.3s, 3103.5MB/s -Best... 10737418240 -> 4272335558 [39.79%]; 38.955s, 262.9MB/s +Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s +Best... 10737418240 -> 4272335558 [39.79%]; 38.955s, 262.9MB/s ``` Decompression speed should be around the same as using the 'better' compression mode. From 5587424ada7a2612192b51ffeea14d2d82c51bd3 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 19 Feb 2021 12:41:04 +0100 Subject: [PATCH 05/10] Omit check for inefficient encoding when impossible. 
--- s2/_generate/gen.go | 2 +- s2/encodeblock_amd64.s | 27 --------------------------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index d663225742..ed5f702437 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -1191,7 +1191,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash MOVL(s, repeatVal) SUBL(candidate, repeatVal) // Check if match is better.. - if true { + if o.maxLen > 65535 { CMPL(length.As32(), U8(1)) JG(LabelRef("match_length_ok_" + name)) CMPL(repeatVal, U32(65535)) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index c78bace185..b2ee175b0f 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -5816,15 +5816,6 @@ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: match_nolit_end_encodeBetterBlockAsm12B: MOVL CX, DI SUBL BP, DI - CMPL R11, $0x01 - JG match_length_ok_encodeBetterBlockAsm12B - CMPL DI, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm12B - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm12B - -match_length_ok_encodeBetterBlockAsm12B: MOVL DI, 16(SP) MOVL 12(SP), BP CMPL BP, SI @@ -6767,15 +6758,6 @@ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: match_nolit_end_encodeBetterBlockAsm10B: MOVL CX, DI SUBL BP, DI - CMPL R11, $0x01 - JG match_length_ok_encodeBetterBlockAsm10B - CMPL DI, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm10B - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm10B - -match_length_ok_encodeBetterBlockAsm10B: MOVL DI, 16(SP) MOVL 12(SP), BP CMPL BP, SI @@ -7708,15 +7690,6 @@ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: match_nolit_end_encodeBetterBlockAsm8B: MOVL CX, DI SUBL BP, DI - CMPL R11, $0x01 - JG match_length_ok_encodeBetterBlockAsm8B - CMPL DI, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm8B - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm8B - 
-match_length_ok_encodeBetterBlockAsm8B: MOVL DI, 16(SP) MOVL 12(SP), BP CMPL BP, SI From 8ba1d1b745bddec49cde69153b68615576a39105 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 23 Feb 2021 12:04:55 +0100 Subject: [PATCH 06/10] Bigger dst limit. --- s2/_generate/gen.go | 4 ++-- s2/encode_better.go | 2 +- s2/encodeblock_amd64.s | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index ed5f702437..c8e3522960 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -822,7 +822,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash const inputMargin = 8 tmp, tmp2, tmp3 := GP64(), GP64(), GP64() MOVQ(lenSrcQ, tmp) - LEAQ(Mem{Base: tmp, Disp: -5}, tmp2) + LEAQ(Mem{Base: tmp, Disp: -6}, tmp2) // sLimitL := len(src) - inputMargin LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) @@ -1225,7 +1225,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash CMPL(s.As32(), sLimitL) JGE(LabelRef("emit_remainder_" + name)) } - // Start load candidate+1 as early as possible... + // Bail if we exceed the maximum size. { CMPQ(dst, dstLimitPtrQ) diff --git a/s2/encode_better.go b/s2/encode_better.go index 24914a0693..13e7d4eada 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -68,7 +68,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { } // Bail if we can't compress to at least this. - dstLimit := len(src) - len(src)>>5 - 5 + dstLimit := len(src) - len(src)>>5 - 6 // nextEmit is where in src the next emitLiteral should start from. 
nextEmit := 0 diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index b2ee175b0f..13350f865a 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -4080,7 +4080,7 @@ zero_loop_encodeBetterBlockAsm: JNZ zero_loop_encodeBetterBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX + LEAQ -6(CX), DX LEAQ -8(CX), BP MOVL BP, 8(SP) SHRQ $0x05, CX @@ -5339,7 +5339,7 @@ zero_loop_encodeBetterBlockAsm12B: JNZ zero_loop_encodeBetterBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX + LEAQ -6(CX), DX LEAQ -8(CX), BP MOVL BP, 8(SP) SHRQ $0x05, CX @@ -6281,7 +6281,7 @@ zero_loop_encodeBetterBlockAsm10B: JNZ zero_loop_encodeBetterBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX + LEAQ -6(CX), DX LEAQ -8(CX), BP MOVL BP, 8(SP) SHRQ $0x05, CX @@ -7223,7 +7223,7 @@ zero_loop_encodeBetterBlockAsm8B: JNZ zero_loop_encodeBetterBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX - LEAQ -5(CX), DX + LEAQ -6(CX), DX LEAQ -8(CX), BP MOVL BP, 8(SP) SHRQ $0x05, CX From 587204ab8e90e07ecb90864460f2ecacf5424de2 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 24 Feb 2021 17:55:55 +0100 Subject: [PATCH 07/10] Fix memcopy writing out-of-bounds. --- s2/_generate/gen.go | 50 +-- s2/encodeblock_amd64.s | 788 +++++++++-------------------------------- 2 files changed, 200 insertions(+), 638 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index c8e3522960..d99a6c2aaa 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -108,8 +108,8 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") Pragma("noescape") - const literalMaxOverhead = 4 o.maxLen = maxLen + var literalMaxOverhead = maxLitOverheadFor(maxLen) var tableSize = 4 * (1 << tableBits) // Memzero needs at least 128 bytes. 
@@ -727,6 +727,22 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m RET() } +func maxLitOverheadFor(n int) int { + switch { + case n == 0: + return 0 + case n < 60: + return 1 + case n < 1<<8: + return 2 + case n < 1<<16: + return 3 + case n < 1<<24: + return 4 + } + return 5 +} + func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) { TEXT(name, 0, "func(dst, src []byte) int") Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", @@ -737,7 +753,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash if lHashBytes > 7 || lHashBytes <= 4 { panic("lHashBytes must be <= 7 and >4") } - const literalMaxOverhead = 4 + var literalMaxOverhead = maxLitOverheadFor(maxLen) + var sTableBits = lTableBits - 2 const sHashBytes = 4 o.maxLen = maxLen @@ -1597,7 +1614,7 @@ func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.G } JMP(end) - // > 32 bytes + // > 64 bytes Label("memmove_long_" + name) // copy(dst[i:], lit) @@ -2259,8 +2276,9 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end // Store start and end for sse_tail Label(name + "forward_sse") - X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() - X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM() + X0, X1, X2, X3, X4, X5 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM() + // X6, X7 := XMM(), XMM() + //X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM() MOVOU(Mem{Base: src}, X0) MOVOU(Mem{Base: src, Disp: 16}, X1) @@ -2271,7 +2289,7 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end dstAlign := GP64() bigLoops := GP64() MOVQ(length, bigLoops) - SHRQ(U8(7), bigLoops) // bigLoops = length / 128 + SHRQ(U8(5), bigLoops) // bigLoops = length / 32 MOVQ(dst, dstAlign) ANDL(U32(31), dstAlign.As32()) @@ -2279,7 +2297,7 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, 
end MOVQ(U32(64), srcOff) SUBQ(dstAlign, srcOff) - // Move 128 bytes/loop + // Move 32 bytes/loop DECQ(bigLoops) JA(LabelRef(name + "forward_sse_loop_32")) @@ -2293,24 +2311,12 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end MOVOU(Mem{Disp: 0, Base: srcPos}, X4) MOVOU(Mem{Disp: 16, Base: srcPos}, X5) - MOVOU(Mem{Disp: 32, Base: srcPos}, X6) - MOVOU(Mem{Disp: 48, Base: srcPos}, X7) - MOVOU(Mem{Disp: 64, Base: srcPos}, X8) - MOVOU(Mem{Disp: 80, Base: srcPos}, X9) - MOVOU(Mem{Disp: 96, Base: srcPos}, X10) - MOVOU(Mem{Disp: 112, Base: srcPos}, X11) MOVOA(X4, Mem{Disp: 0, Base: dstPos}) MOVOA(X5, Mem{Disp: 16, Base: dstPos}) - MOVOA(X6, Mem{Disp: 32, Base: dstPos}) - MOVOA(X7, Mem{Disp: 48, Base: dstPos}) - MOVOA(X8, Mem{Disp: 64, Base: dstPos}) - MOVOA(X9, Mem{Disp: 80, Base: dstPos}) - MOVOA(X10, Mem{Disp: 96, Base: dstPos}) - MOVOA(X11, Mem{Disp: 112, Base: dstPos}) - ADDQ(U8(128), dstPos) - ADDQ(U8(128), srcPos) - ADDQ(U8(128), srcOff) // This could be outside the loop, but we lose a reg if we do. + ADDQ(U8(32), dstPos) + ADDQ(U8(32), srcPos) + ADDQ(U8(32), srcOff) // This could be outside the loop, but we lose a reg if we do. 
DECQ(bigLoops) JNA(LabelRef(name + "big_loop_back")) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 13350f865a..99f45d0c05 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -215,7 +215,7 @@ memmove_long_repeat_emit_encodeBlockAsm: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -228,23 +228,11 @@ memmove_long_repeat_emit_encodeBlockAsm: emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back @@ -618,7 +606,7 @@ match_extend_back_loop_encodeBlockAsm: match_extend_back_end_encodeBlockAsm: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -748,7 +736,7 @@ memmove_long_match_emit_encodeBlockAsm: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -761,23 +749,11 @@ memmove_long_match_emit_encodeBlockAsm: emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - 
ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back @@ -1069,7 +1045,7 @@ match_nolit_dst_ok_encodeBlockAsm: emit_remainder_encodeBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -1201,7 +1177,7 @@ memmove_long_emit_remainder_encodeBlockAsm: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -1214,23 +1190,11 @@ memmove_long_emit_remainder_encodeBlockAsm: emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back @@ -1444,7 +1408,7 @@ memmove_long_repeat_emit_encodeBlockAsm12B: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -1457,23 +1421,11 @@ memmove_long_repeat_emit_encodeBlockAsm12B: emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, 
R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back @@ -1710,7 +1662,7 @@ match_extend_back_loop_encodeBlockAsm12B: match_extend_back_end_encodeBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -1821,7 +1773,7 @@ memmove_long_match_emit_encodeBlockAsm12B: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -1834,23 +1786,11 @@ memmove_long_match_emit_encodeBlockAsm12B: emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back @@ -2028,7 +1968,7 @@ match_nolit_dst_ok_encodeBlockAsm12B: emit_remainder_encodeBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -2141,7 +2081,7 @@ memmove_long_emit_remainder_encodeBlockAsm12B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -2154,23 +2094,11 @@ memmove_long_emit_remainder_encodeBlockAsm12B: emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA 
X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back @@ -2384,7 +2312,7 @@ memmove_long_repeat_emit_encodeBlockAsm10B: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -2397,23 +2325,11 @@ memmove_long_repeat_emit_encodeBlockAsm10B: emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back @@ -2650,7 +2566,7 @@ match_extend_back_loop_encodeBlockAsm10B: match_extend_back_end_encodeBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -2761,7 +2677,7 @@ memmove_long_match_emit_encodeBlockAsm10B: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -2774,23 +2690,11 @@ memmove_long_match_emit_encodeBlockAsm10B: emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) 
- MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back @@ -2968,7 +2872,7 @@ match_nolit_dst_ok_encodeBlockAsm10B: emit_remainder_encodeBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -3081,7 +2985,7 @@ memmove_long_emit_remainder_encodeBlockAsm10B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -3094,23 +2998,11 @@ memmove_long_emit_remainder_encodeBlockAsm10B: emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back @@ -3324,7 +3216,7 @@ memmove_long_repeat_emit_encodeBlockAsm8B: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -3337,23 +3229,11 @@ memmove_long_repeat_emit_encodeBlockAsm8B: emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA 
X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back @@ -3580,7 +3460,7 @@ match_extend_back_loop_encodeBlockAsm8B: match_extend_back_end_encodeBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -3691,7 +3571,7 @@ memmove_long_match_emit_encodeBlockAsm8B: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -3704,23 +3584,11 @@ memmove_long_match_emit_encodeBlockAsm8B: emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back @@ -3892,7 +3760,7 @@ match_nolit_dst_ok_encodeBlockAsm8B: emit_remainder_encodeBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -4005,7 +3873,7 @@ memmove_long_emit_remainder_encodeBlockAsm8B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -4018,23 +3886,11 @@ memmove_long_emit_remainder_encodeBlockAsm8B: 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back @@ -4261,7 +4117,7 @@ memmove_long_repeat_emit_encodeBetterBlockAsm: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -4274,23 +4130,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm: emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back @@ -4662,7 +4506,7 @@ match_extend_back_loop_encodeBetterBlockAsm: match_extend_back_end_encodeBetterBlockAsm: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -4845,7 +4689,7 @@ memmove_long_match_emit_encodeBetterBlockAsm: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 @@ -4858,23 +4702,11 @@ memmove_long_match_emit_encodeBetterBlockAsm: 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R9 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back @@ -5132,7 +4964,7 @@ match_nolit_dst_ok_encodeBetterBlockAsm: emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -5264,7 +5096,7 @@ memmove_long_emit_remainder_encodeBetterBlockAsm: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -5277,23 +5109,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm: emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back @@ -5501,7 +5321,7 @@ memmove_long_repeat_emit_encodeBetterBlockAsm12B: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -5514,23 +5334,11 @@ 
memmove_long_repeat_emit_encodeBetterBlockAsm12B: emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back @@ -5765,7 +5573,7 @@ match_extend_back_loop_encodeBetterBlockAsm12B: match_extend_back_end_encodeBetterBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -5920,7 +5728,7 @@ memmove_long_match_emit_encodeBetterBlockAsm12B: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 @@ -5933,23 +5741,11 @@ memmove_long_match_emit_encodeBetterBlockAsm12B: emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R9 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back @@ -6093,7 +5889,7 @@ match_nolit_dst_ok_encodeBetterBlockAsm12B: emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, 
(SP) JL emit_remainder_ok_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -6206,7 +6002,7 @@ memmove_long_emit_remainder_encodeBetterBlockAsm12B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -6219,23 +6015,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm12B: emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back @@ -6443,7 +6227,7 @@ memmove_long_repeat_emit_encodeBetterBlockAsm10B: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -6456,23 +6240,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm10B: emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back @@ -6707,7 +6479,7 @@ match_extend_back_loop_encodeBetterBlockAsm10B: match_extend_back_end_encodeBetterBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 
4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -6862,7 +6634,7 @@ memmove_long_match_emit_encodeBetterBlockAsm10B: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 @@ -6875,23 +6647,11 @@ memmove_long_match_emit_encodeBetterBlockAsm10B: emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R9 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back @@ -7035,7 +6795,7 @@ match_nolit_dst_ok_encodeBetterBlockAsm10B: emit_remainder_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -7148,7 +6908,7 @@ memmove_long_emit_remainder_encodeBetterBlockAsm10B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -7161,23 +6921,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm10B: emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - 
ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back @@ -7385,7 +7133,7 @@ memmove_long_repeat_emit_encodeBetterBlockAsm8B: MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 - SHRQ $0x07, R11 + SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 @@ -7398,23 +7146,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm8B: emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 - MOVOU 32(R10), X6 - MOVOU 48(R10), X7 - MOVOU 64(R10), X8 - MOVOU 80(R10), X9 - MOVOU 96(R10), X10 - MOVOU 112(R10), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) - ADDQ $0x80, R13 - ADDQ $0x80, R10 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back @@ -7639,7 +7375,7 @@ match_extend_back_loop_encodeBetterBlockAsm8B: match_extend_back_end_encodeBetterBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -7794,7 +7530,7 @@ memmove_long_match_emit_encodeBetterBlockAsm8B: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 @@ -7807,23 +7543,11 @@ memmove_long_match_emit_encodeBetterBlockAsm8B: emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R13) MOVOA X5, 16(R13) - MOVOA X6, 32(R13) - MOVOA X7, 48(R13) - MOVOA X8, 64(R13) - MOVOA X9, 80(R13) - MOVOA X10, 96(R13) - MOVOA X11, 112(R13) 
- ADDQ $0x80, R13 - ADDQ $0x80, R9 - ADDQ $0x80, R12 + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back @@ -7961,7 +7685,7 @@ match_nolit_dst_ok_encodeBetterBlockAsm8B: emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -8074,7 +7798,7 @@ memmove_long_emit_remainder_encodeBetterBlockAsm8B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -8087,23 +7811,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm8B: emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back @@ -8336,7 +8048,7 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -8349,23 +8061,11 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA 
X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back @@ -8535,7 +8235,7 @@ match_extend_back_loop_encodeSnappyBlockAsm: match_extend_back_end_encodeSnappyBlockAsm: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -8665,7 +8365,7 @@ memmove_long_match_emit_encodeSnappyBlockAsm: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -8678,23 +8378,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back @@ -8852,7 +8540,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm: emit_remainder_encodeSnappyBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) @@ -8984,7 +8672,7 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -8997,23 +8685,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (BP), X4 
MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back @@ -9227,7 +8903,7 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm12B: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -9240,23 +8916,11 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm12B: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back @@ -9402,7 +9066,7 @@ match_extend_back_loop_encodeSnappyBlockAsm12B: match_extend_back_end_encodeSnappyBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -9513,7 +9177,7 @@ memmove_long_match_emit_encodeSnappyBlockAsm12B: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -9526,23 +9190,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm12B: 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back @@ -9676,7 +9328,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm12B: emit_remainder_encodeSnappyBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) @@ -9789,7 +9441,7 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm12B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -9802,23 +9454,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm12B: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back @@ -10032,7 +9672,7 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm10B: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -10045,23 +9685,11 @@ 
memmove_long_repeat_emit_encodeSnappyBlockAsm10B: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back @@ -10207,7 +9835,7 @@ match_extend_back_loop_encodeSnappyBlockAsm10B: match_extend_back_end_encodeSnappyBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -10318,7 +9946,7 @@ memmove_long_match_emit_encodeSnappyBlockAsm10B: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -10331,23 +9959,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm10B: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back @@ -10481,7 +10097,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm10B: emit_remainder_encodeSnappyBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL 
emit_remainder_ok_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) @@ -10594,7 +10210,7 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm10B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -10607,23 +10223,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm10B: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back @@ -10837,7 +10441,7 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm8B: MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -10850,23 +10454,11 @@ memmove_long_repeat_emit_encodeSnappyBlockAsm8B: emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back @@ -11010,7 +10602,7 @@ match_extend_back_loop_encodeSnappyBlockAsm8B: match_extend_back_end_encodeSnappyBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI + 
LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -11121,7 +10713,7 @@ memmove_long_match_emit_encodeSnappyBlockAsm8B: MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 - SHRQ $0x07, R10 + SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 @@ -11134,23 +10726,11 @@ memmove_long_match_emit_encodeSnappyBlockAsm8B: emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 - MOVOU 32(R9), X6 - MOVOU 48(R9), X7 - MOVOU 64(R9), X8 - MOVOU 80(R9), X9 - MOVOU 96(R9), X10 - MOVOU 112(R9), X11 MOVOA X4, (R12) MOVOA X5, 16(R12) - MOVOA X6, 32(R12) - MOVOA X7, 48(R12) - MOVOA X8, 64(R12) - MOVOA X9, 80(R12) - MOVOA X10, 96(R12) - MOVOA X11, 112(R12) - ADDQ $0x80, R12 - ADDQ $0x80, R9 - ADDQ $0x80, R11 + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back @@ -11282,7 +10862,7 @@ match_nolit_dst_ok_encodeSnappyBlockAsm8B: emit_remainder_encodeSnappyBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX + LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) @@ -11395,7 +10975,7 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm8B: MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -11408,23 +10988,11 @@ memmove_long_emit_remainder_encodeSnappyBlockAsm8B: emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + 
ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back @@ -11571,7 +11139,7 @@ memmove_long_standalone: MOVOU -32(CX)(DX*1), X2 MOVOU -16(CX)(DX*1), X3 MOVQ DX, SI - SHRQ $0x07, SI + SHRQ $0x05, SI MOVQ AX, BP ANDL $0x0000001f, BP MOVQ $0x00000040, DI @@ -11584,23 +11152,11 @@ memmove_long_standalone: emit_lit_memmove_long_standalonelarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 - MOVOU 32(BP), X6 - MOVOU 48(BP), X7 - MOVOU 64(BP), X8 - MOVOU 80(BP), X9 - MOVOU 96(BP), X10 - MOVOU 112(BP), X11 MOVOA X4, (R8) MOVOA X5, 16(R8) - MOVOA X6, 32(R8) - MOVOA X7, 48(R8) - MOVOA X8, 64(R8) - MOVOA X9, 80(R8) - MOVOA X10, 96(R8) - MOVOA X11, 112(R8) - ADDQ $0x80, R8 - ADDQ $0x80, BP - ADDQ $0x80, DI + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI DECQ SI JNA emit_lit_memmove_long_standalonelarge_big_loop_back From e2bf0147d1b90f76911dffc3ac70d763ff65edee Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 24 Feb 2021 18:43:48 +0100 Subject: [PATCH 08/10] Benchmark decompression time as well. --- s2/cmd/s2c/main.go | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/s2/cmd/s2c/main.go b/s2/cmd/s2c/main.go index 90c128b7fa..64483fbbb9 100644 --- a/s2/cmd/s2c/main.go +++ b/s2/cmd/s2c/main.go @@ -2,6 +2,7 @@ package main import ( "bufio" + "bytes" "errors" "flag" "fmt" @@ -138,6 +139,7 @@ Options:`) *quiet = *quiet || *stdout if *bench > 0 { debug.SetGCPercent(10) + dec := s2.NewReader(nil) for _, filename := range files { func() { if !*quiet { @@ -152,8 +154,17 @@ Options:`) _, err = io.ReadFull(file, b) exitErr(err) file.Close() + var buf *bytes.Buffer for i := 0; i < *bench; i++ { - w, errFn := verifyTo(ioutil.Discard) + w := ioutil.Discard + // Verify with this buffer... 
+ if *verify { + if buf == nil { + buf = bytes.NewBuffer(make([]byte, 0, len(b)+(len(b)>>8))) + } + buf.Reset() + w = buf + } wc := wCounter{out: w} if !*quiet { fmt.Print("\nCompressing...") @@ -172,7 +183,27 @@ Options:`) ms := elapsed.Round(time.Millisecond) fmt.Printf(" %d -> %d [%.02f%%]; %v, %.01fMB/s", input, wc.n, pct, ms, mbpersec) } - exitErr(errFn()) + if *verify { + if !*quiet { + fmt.Print("\nDecompressing.") + } + start := time.Now() + dec.Reset(buf) + n, err := io.Copy(ioutil.Discard, dec) + exitErr(err) + if int(n) != len(b) { + exitErr(fmt.Errorf("unexpected size, want %d, got %d", len(b), n)) + } + if !*quiet { + input := len(b) + elapsed := time.Since(start) + mbpersec := (float64(input) / (1024 * 1024)) / (float64(elapsed) / (float64(time.Second))) + pct := float64(input) * 100 / float64(wc.n) + ms := elapsed.Round(time.Millisecond) + fmt.Printf(" %d -> %d [%.02f%%]; %v, %.01fMB/s", wc.n, n, pct, ms, mbpersec) + } + dec.Reset(nil) + } } fmt.Println("") wr.Close() From 164dbe0a99cc2c83307871cf0ab017f9e3621821 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 25 Feb 2021 10:17:26 +0100 Subject: [PATCH 09/10] Generate for max 4MB blocks. 
--- s2/_generate/gen.go | 2 + s2/encode_amd64.go | 10 +- s2/encodeblock_amd64.go | 14 + s2/encodeblock_amd64.s | 4473 +++++++++++++++++++++++++++++---------- 4 files changed, 3399 insertions(+), 1100 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index d99a6c2aaa..01830e1ff4 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -35,11 +35,13 @@ func main() { snappy: false, } o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B) + o.genEncodeBlockAsm("encodeBlockAsm4MB", 14, 6, 6, 4<<20) o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B) o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B) o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B) o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20) o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go index c3fc8d1ee3..e5b47a7a03 100644 --- a/s2/encode_amd64.go +++ b/s2/encode_amd64.go @@ -21,9 +21,12 @@ func encodeBlock(dst, src []byte) (d int) { limit8B = 512 ) - if len(src) >= limit12B { + if len(src) >= 4<<20 { return encodeBlockAsm(dst, src) } + if len(src) >= limit12B { + return encodeBlockAsm4MB(dst, src) + } if len(src) >= limit10B { return encodeBlockAsm12B(dst, src) } @@ -53,9 +56,12 @@ func encodeBlockBetter(dst, src []byte) (d int) { limit8B = 512 ) - if len(src) >= limit12B { + if len(src) > 4<<20 { return encodeBetterBlockAsm(dst, src) } + if len(src) >= limit12B { + return encodeBetterBlockAsm4MB(dst, src) + } if len(src) >= limit10B { return encodeBetterBlockAsm12B(dst, src) } diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index cb04bdd295..9ab3c7ae74 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -13,6 +13,13 @@ 
package s2 //go:noescape func encodeBlockAsm(dst []byte, src []byte) int +// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBlockAsm4MB(dst []byte, src []byte) int + // encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 16383 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. @@ -41,6 +48,13 @@ func encodeBlockAsm8B(dst []byte, src []byte) int //go:noescape func encodeBetterBlockAsm(dst []byte, src []byte) int +// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4194304 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeBetterBlockAsm4MB(dst []byte, src []byte) int + // encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 16383 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. 
diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 99f45d0c05..918f57f9e8 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -1218,15 +1218,15 @@ emit_literal_done_emit_remainder_encodeBlockAsm: MOVQ AX, ret+48(FP) RET -// func encodeBlockAsm12B(dst []byte, src []byte) int +// func encodeBlockAsm4MB(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBlockAsm12B(SB), $16408-56 +TEXT ·encodeBlockAsm4MB(SB), $65560-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX + MOVQ $0x00000200, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBlockAsm12B: +zero_loop_encodeBlockAsm4MB: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -1237,7 +1237,7 @@ zero_loop_encodeBlockAsm12B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBlockAsm12B + JNZ zero_loop_encodeBlockAsm4MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -1251,25 +1251,25 @@ zero_loop_encodeBlockAsm12B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBlockAsm12B: +search_loop_encodeBlockAsm4MB: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x05, BP + SHRL $0x06, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBlockAsm12B + JGE emit_remainder_encodeBlockAsm4MB MOVL BP, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 - SHLQ $0x18, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x18, R10 + SHRQ $0x32, R9 + SHLQ $0x10, R10 IMULQ R8, R10 - SHRQ $0x34, R10 + SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BP MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) @@ -1277,115 +1277,126 @@ search_loop_encodeBlockAsm12B: MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 - SHLQ $0x18, R9 + SHLQ $0x10, R9 IMULQ R8, R9 - SHRQ $0x34, R9 + SHRQ $0x32, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm12B + JNE no_repeat_found_encodeBlockAsm4MB LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP 
SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBlockAsm12B + JZ repeat_extend_back_end_encodeBlockAsm4MB -repeat_extend_back_loop_encodeBlockAsm12B: +repeat_extend_back_loop_encodeBlockAsm4MB: CMPL SI, DI - JLE repeat_extend_back_end_encodeBlockAsm12B + JLE repeat_extend_back_end_encodeBlockAsm4MB MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBlockAsm12B + JNE repeat_extend_back_end_encodeBlockAsm4MB LEAL -1(SI), SI DECL BP - JNZ repeat_extend_back_loop_encodeBlockAsm12B + JNZ repeat_extend_back_loop_encodeBlockAsm4MB -repeat_extend_back_end_encodeBlockAsm12B: +repeat_extend_back_end_encodeBlockAsm4MB: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B + JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm12B + JLT one_byte_repeat_emit_encodeBlockAsm4MB CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm12B + JLT two_bytes_repeat_emit_encodeBlockAsm4MB + CMPL BP, $0x00010000 + JLT three_bytes_repeat_emit_encodeBlockAsm4MB + MOVL BP, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBlockAsm4MB + +three_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm4MB -two_bytes_repeat_emit_encodeBlockAsm12B: +two_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBlockAsm12B - JMP memmove_long_repeat_emit_encodeBlockAsm12B + JL memmove_repeat_emit_encodeBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBlockAsm4MB -one_byte_repeat_emit_encodeBlockAsm12B: +one_byte_repeat_emit_encodeBlockAsm4MB: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBlockAsm12B: 
+memmove_repeat_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), BP // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB 
-emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -1395,11 +1406,11 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBlockAsm12B: +memmove_end_copy_repeat_emit_encodeBlockAsm4MB: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm12B + JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB -memmove_long_repeat_emit_encodeBlockAsm12B: +memmove_long_repeat_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), BP // genMemMoveLong @@ -1414,11 +1425,11 @@ memmove_long_repeat_emit_encodeBlockAsm12B: MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) @@ -1427,23 +1438,23 @@ emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: ADDQ $0x20, R10 ADDQ $0x20, 
R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBlockAsm12B: +emit_literal_done_repeat_emit_encodeBlockAsm4MB: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -1455,78 +1466,90 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm12B + JL matchlen_single_repeat_extend_encodeBlockAsm4MB -matchlen_loopback_repeat_extend_encodeBlockAsm12B: +matchlen_loopback_repeat_extend_encodeBlockAsm4MB: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm12B + JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm12B + JMP repeat_extend_forward_end_encodeBlockAsm4MB -matchlen_loop_repeat_extend_encodeBlockAsm12B: +matchlen_loop_repeat_extend_encodeBlockAsm4MB: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B + JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB -matchlen_single_repeat_extend_encodeBlockAsm12B: +matchlen_single_repeat_extend_encodeBlockAsm4MB: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBlockAsm12B + JZ repeat_extend_forward_end_encodeBlockAsm4MB 
-matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: +matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm12B + JNE repeat_extend_forward_end_encodeBlockAsm4MB LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB -repeat_extend_forward_end_encodeBlockAsm12B: +repeat_extend_forward_end_encodeBlockAsm4MB: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm12B + JZ repeat_as_copy_encodeBlockAsm4MB // emitRepeat MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm12B + JLE repeat_two_match_repeat_encodeBlockAsm4MB CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm12B + JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB -cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: +cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm12B + JLT repeat_three_match_repeat_encodeBlockAsm4MB + CMPL BP, $0x00010100 + JLT repeat_four_match_repeat_encodeBlockAsm4MB + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_match_repeat_encodeBlockAsm4MB: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_three_match_repeat_encodeBlockAsm12B: +repeat_three_match_repeat_encodeBlockAsm4MB: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB 
-repeat_two_match_repeat_encodeBlockAsm12B: +repeat_two_match_repeat_encodeBlockAsm4MB: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_offset_match_repeat_encodeBlockAsm12B: +repeat_two_offset_match_repeat_encodeBlockAsm4MB: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -1535,13 +1558,93 @@ repeat_two_offset_match_repeat_encodeBlockAsm12B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_as_copy_encodeBlockAsm12B: +repeat_as_copy_encodeBlockAsm4MB: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm12B: + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB + + // emitRepeat + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB 
+ +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB + +four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: + TESTL BP, BP + JZ repeat_end_emit_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP @@ -1551,36 +1654,48 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm12B: MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL SI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + CMPL BP, $0x00010100 + JLT 
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -1589,14 +1704,14 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB -two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: +two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -1605,152 +1720,163 @@ two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: ORL SI, BP 
MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B + JMP repeat_end_emit_encodeBlockAsm4MB -emit_copy_three_repeat_as_copy_encodeBlockAsm12B: +emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBlockAsm12B: +repeat_end_emit_encodeBlockAsm4MB: MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm4MB -no_repeat_found_encodeBlockAsm12B: +no_repeat_found_encodeBlockAsm4MB: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBlockAsm12B + JEQ candidate_match_encodeBlockAsm4MB SHRQ $0x08, SI MOVL 24(SP)(R9*4), BP LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm12B + JEQ candidate2_match_encodeBlockAsm4MB MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeBlockAsm12B + JEQ candidate3_match_encodeBlockAsm4MB MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm4MB -candidate3_match_encodeBlockAsm12B: +candidate3_match_encodeBlockAsm4MB: ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm12B + JMP candidate_match_encodeBlockAsm4MB -candidate2_match_encodeBlockAsm12B: +candidate2_match_encodeBlockAsm4MB: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BP -candidate_match_encodeBlockAsm12B: +candidate_match_encodeBlockAsm4MB: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBlockAsm12B + JZ match_extend_back_end_encodeBlockAsm4MB -match_extend_back_loop_encodeBlockAsm12B: +match_extend_back_loop_encodeBlockAsm4MB: CMPL CX, SI - JLE match_extend_back_end_encodeBlockAsm12B + JLE match_extend_back_end_encodeBlockAsm4MB MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBlockAsm12B + JNE match_extend_back_end_encodeBlockAsm4MB LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBlockAsm12B - JMP match_extend_back_loop_encodeBlockAsm12B + JZ match_extend_back_end_encodeBlockAsm4MB + JMP 
match_extend_back_loop_encodeBlockAsm4MB -match_extend_back_end_encodeBlockAsm12B: +match_extend_back_end_encodeBlockAsm4MB: MOVL CX, SI SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI + LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBlockAsm12B + JL match_dst_size_check_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBlockAsm12B: +match_dst_size_check_encodeBlockAsm4MB: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm12B + JEQ emit_literal_done_match_emit_encodeBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm12B + JLT one_byte_match_emit_encodeBlockAsm4MB CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm12B + JLT two_bytes_match_emit_encodeBlockAsm4MB + CMPL DI, $0x00010000 + JLT three_bytes_match_emit_encodeBlockAsm4MB + MOVL DI, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW DI, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBlockAsm4MB + +three_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm4MB -two_bytes_match_emit_encodeBlockAsm12B: +two_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBlockAsm12B - JMP memmove_long_match_emit_encodeBlockAsm12B + JL memmove_match_emit_encodeBlockAsm4MB + JMP memmove_long_match_emit_encodeBlockAsm4MB -one_byte_match_emit_encodeBlockAsm12B: +one_byte_match_emit_encodeBlockAsm4MB: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBlockAsm12B: +memmove_match_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 + JB 
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP 
memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B + JMP memmove_end_copy_match_emit_encodeBlockAsm4MB -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -1760,11 +1886,11 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBlockAsm12B: +memmove_end_copy_match_emit_encodeBlockAsm4MB: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm12B + JMP emit_literal_done_match_emit_encodeBlockAsm4MB -memmove_long_match_emit_encodeBlockAsm12B: +memmove_long_match_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), DI // genMemMoveLong @@ -1779,11 +1905,11 @@ memmove_long_match_emit_encodeBlockAsm12B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) @@ -1792,24 +1918,24 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: 
+emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBlockAsm12B: -match_nolit_loop_encodeBlockAsm12B: +emit_literal_done_match_emit_encodeBlockAsm4MB: +match_nolit_loop_encodeBlockAsm4MB: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -1823,46 +1949,126 @@ match_nolit_loop_encodeBlockAsm12B: // matchLen XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm12B + JL matchlen_single_match_nolit_encodeBlockAsm4MB -matchlen_loopback_match_nolit_encodeBlockAsm12B: +matchlen_loopback_match_nolit_encodeBlockAsm4MB: MOVQ (DI)(R9*1), R8 XORQ (BP)(R9*1), R8 TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm12B + JZ matchlen_loop_match_nolit_encodeBlockAsm4MB BSFQ R8, R8 SARQ $0x03, R8 LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm12B + JMP match_nolit_end_encodeBlockAsm4MB -matchlen_loop_match_nolit_encodeBlockAsm12B: +matchlen_loop_match_nolit_encodeBlockAsm4MB: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm12B + JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB -matchlen_single_match_nolit_encodeBlockAsm12B: +matchlen_single_match_nolit_encodeBlockAsm4MB: TESTL SI, SI - JZ match_nolit_end_encodeBlockAsm12B + JZ match_nolit_end_encodeBlockAsm4MB -matchlen_single_loopback_match_nolit_encodeBlockAsm12B: +matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: MOVB (DI)(R9*1), R8 CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm12B + JNE match_nolit_end_encodeBlockAsm4MB LEAL 1(R9), R9 DECL SI - JNZ 
matchlen_single_loopback_match_nolit_encodeBlockAsm12B + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB -match_nolit_end_encodeBlockAsm12B: +match_nolit_end_encodeBlockAsm4MB: ADDL R9, CX MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm12B: + CMPL BP, $0x00010000 + JL two_byte_offset_match_nolit_encodeBlockAsm4MB + +four_bytes_loop_back_match_nolit_encodeBlockAsm4MB: CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB + MOVB $0xff, (AX) + MOVL BP, 1(AX) + LEAL -64(R9), R9 + ADDQ $0x05, AX + CMPL R9, $0x04 + JL four_bytes_remain_match_nolit_encodeBlockAsm4MB + + // emitRepeat + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + CMPL R9, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy + CMPL R9, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy + LEAL -65536(R9), R9 + MOVL R9, BP + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: + 
XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB + +four_bytes_remain_match_nolit_encodeBlockAsm4MB: + TESTL R9, R9 + JZ match_nolit_emitcopy_end_encodeBlockAsm4MB + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +two_byte_offset_match_nolit_encodeBlockAsm4MB: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB MOVB $0xee, (AX) MOVW BP, 1(AX) LEAL -60(R9), R9 @@ -1872,36 +2078,48 @@ two_byte_offset_match_nolit_encodeBlockAsm12B: MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL BP, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: CMPL R9, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short + CMPL R9, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short + LEAL -65536(R9), R9 + MOVL R9, BP + MOVW $0x001d, (AX) + MOVW R9, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP 
match_nolit_emitcopy_end_encodeBlockAsm4MB -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) @@ -1910,14 +2128,14 @@ repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBlockAsm4MB -two_byte_offset_short_match_nolit_encodeBlockAsm12B: +two_byte_offset_short_match_nolit_encodeBlockAsm4MB: CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm12B + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm12B + JGE emit_copy_three_match_nolit_encodeBlockAsm4MB MOVB $0x01, BL LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) @@ -1926,138 +2144,149 @@ two_byte_offset_short_match_nolit_encodeBlockAsm12B: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB -emit_copy_three_match_nolit_encodeBlockAsm12B: +emit_copy_three_match_nolit_encodeBlockAsm4MB: MOVB $0x02, BL LEAL -4(BX)(R9*4), R9 MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBlockAsm12B: 
+match_nolit_emitcopy_end_encodeBlockAsm4MB: CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm12B + JGE emit_remainder_encodeBlockAsm4MB MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm12B + JL match_nolit_dst_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 +match_nolit_dst_ok_encodeBlockAsm4MB: + MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BP - SHLQ $0x18, DI + SHLQ $0x10, DI IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BP + SHRQ $0x32, DI + SHLQ $0x10, BP IMULQ R8, BP - SHRQ $0x34, BP + SHRQ $0x32, BP LEAL -2(CX), R8 LEAQ 24(SP)(BP*4), R9 MOVL (R9), BP MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeBlockAsm12B + JEQ match_nolit_loop_encodeBlockAsm4MB INCL CX - JMP search_loop_encodeBlockAsm12B + JMP search_loop_encodeBlockAsm4MB -emit_remainder_encodeBlockAsm12B: +emit_remainder_encodeBlockAsm4MB: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX + LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm12B + JL emit_remainder_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBlockAsm12B: +emit_remainder_ok_encodeBlockAsm4MB: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B + JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm12B + JLT one_byte_emit_remainder_encodeBlockAsm4MB CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm12B + JLT two_bytes_emit_remainder_encodeBlockAsm4MB + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBlockAsm4MB + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBlockAsm4MB + 
+three_bytes_emit_remainder_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm4MB -two_bytes_emit_remainder_encodeBlockAsm12B: +two_bytes_emit_remainder_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm12B - JMP memmove_long_emit_remainder_encodeBlockAsm12B + JL memmove_emit_remainder_encodeBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBlockAsm4MB -one_byte_emit_remainder_encodeBlockAsm12B: +one_byte_emit_remainder_encodeBlockAsm4MB: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBlockAsm12B: +memmove_emit_remainder_encodeBlockAsm4MB: LEAQ (AX)(BP*1), DX MOVL BP, BX // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, 
-1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -2067,11 +2296,11 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBlockAsm12B: +memmove_end_copy_emit_remainder_encodeBlockAsm4MB: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm12B + JMP 
emit_literal_done_emit_remainder_encodeBlockAsm4MB -memmove_long_emit_remainder_encodeBlockAsm12B: +memmove_long_emit_remainder_encodeBlockAsm4MB: LEAQ (AX)(BP*1), DX MOVL BP, BX @@ -2087,11 +2316,11 @@ memmove_long_emit_remainder_encodeBlockAsm12B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOA X4, (R8) @@ -2100,37 +2329,37 @@ emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: ADDQ $0x20, BP ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBlockAsm12B: +emit_literal_done_emit_remainder_encodeBlockAsm4MB: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBlockAsm10B(dst []byte, src []byte) int +// func encodeBlockAsm12B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBlockAsm10B(SB), $4120-56 +TEXT ·encodeBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX + MOVQ $0x00000080, CX 
LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBlockAsm10B: +zero_loop_encodeBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -2141,7 +2370,7 @@ zero_loop_encodeBlockAsm10B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBlockAsm10B + JNZ zero_loop_encodeBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -2155,25 +2384,25 @@ zero_loop_encodeBlockAsm10B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBlockAsm10B: +search_loop_encodeBlockAsm12B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBlockAsm10B + JGE emit_remainder_encodeBlockAsm12B MOVL BP, 20(SP) - MOVQ $0x9e3779b1, R8 + MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 - SHLQ $0x20, R9 + SHLQ $0x18, R9 IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 + SHRQ $0x34, R9 + SHLQ $0x18, R10 IMULQ R8, R10 - SHRQ $0x36, R10 + SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BP MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) @@ -2181,115 +2410,115 @@ search_loop_encodeBlockAsm10B: MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 - SHLQ $0x20, R9 + SHLQ $0x18, R9 IMULQ R8, R9 - SHRQ $0x36, R9 + SHRQ $0x34, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm10B + JNE no_repeat_found_encodeBlockAsm12B LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBlockAsm10B + JZ repeat_extend_back_end_encodeBlockAsm12B -repeat_extend_back_loop_encodeBlockAsm10B: +repeat_extend_back_loop_encodeBlockAsm12B: CMPL SI, DI - JLE repeat_extend_back_end_encodeBlockAsm10B + JLE repeat_extend_back_end_encodeBlockAsm12B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBlockAsm10B + JNE repeat_extend_back_end_encodeBlockAsm12B LEAL -1(SI), SI DECL BP - JNZ repeat_extend_back_loop_encodeBlockAsm10B + JNZ 
repeat_extend_back_loop_encodeBlockAsm12B -repeat_extend_back_end_encodeBlockAsm10B: +repeat_extend_back_end_encodeBlockAsm12B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B + JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm10B + JLT one_byte_repeat_emit_encodeBlockAsm12B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm10B + JLT two_bytes_repeat_emit_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm12B -two_bytes_repeat_emit_encodeBlockAsm10B: +two_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBlockAsm10B - JMP memmove_long_repeat_emit_encodeBlockAsm10B + JL memmove_repeat_emit_encodeBlockAsm12B + JMP memmove_long_repeat_emit_encodeBlockAsm12B -one_byte_repeat_emit_encodeBlockAsm10B: +one_byte_repeat_emit_encodeBlockAsm12B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBlockAsm10B: +memmove_repeat_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), BP // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP 
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 
MOVOU -32(R9)(R8*1), X2 @@ -2299,11 +2528,11 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBlockAsm10B: +memmove_end_copy_repeat_emit_encodeBlockAsm12B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm10B + JMP emit_literal_done_repeat_emit_encodeBlockAsm12B -memmove_long_repeat_emit_encodeBlockAsm10B: +memmove_long_repeat_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), BP // genMemMoveLong @@ -2318,11 +2547,11 @@ memmove_long_repeat_emit_encodeBlockAsm10B: MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) @@ -2331,23 +2560,23 @@ emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBlockAsm10B: +emit_literal_done_repeat_emit_encodeBlockAsm12B: ADDL $0x05, CX MOVL CX, 
BP SUBL 16(SP), BP @@ -2359,78 +2588,78 @@ emit_literal_done_repeat_emit_encodeBlockAsm10B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm10B + JL matchlen_single_repeat_extend_encodeBlockAsm12B -matchlen_loopback_repeat_extend_encodeBlockAsm10B: +matchlen_loopback_repeat_extend_encodeBlockAsm12B: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm10B + JZ matchlen_loop_repeat_extend_encodeBlockAsm12B BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm10B + JMP repeat_extend_forward_end_encodeBlockAsm12B -matchlen_loop_repeat_extend_encodeBlockAsm10B: +matchlen_loop_repeat_extend_encodeBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B + JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B -matchlen_single_repeat_extend_encodeBlockAsm10B: +matchlen_single_repeat_extend_encodeBlockAsm12B: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBlockAsm10B + JZ repeat_extend_forward_end_encodeBlockAsm12B -matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: +matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm10B + JNE repeat_extend_forward_end_encodeBlockAsm12B LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B -repeat_extend_forward_end_encodeBlockAsm10B: +repeat_extend_forward_end_encodeBlockAsm12B: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm10B + JZ repeat_as_copy_encodeBlockAsm12B // emitRepeat MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm10B + JLE repeat_two_match_repeat_encodeBlockAsm12B CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B 
+ JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBlockAsm10B + JLT repeat_two_offset_match_repeat_encodeBlockAsm12B -cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: +cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm10B + JLT repeat_three_match_repeat_encodeBlockAsm12B LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_three_match_repeat_encodeBlockAsm10B: +repeat_three_match_repeat_encodeBlockAsm12B: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_match_repeat_encodeBlockAsm10B: +repeat_two_match_repeat_encodeBlockAsm12B: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_offset_match_repeat_encodeBlockAsm10B: +repeat_two_offset_match_repeat_encodeBlockAsm12B: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -2439,13 +2668,13 @@ repeat_two_offset_match_repeat_encodeBlockAsm10B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_as_copy_encodeBlockAsm10B: +repeat_as_copy_encodeBlockAsm12B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm10B: +two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP @@ -2455,36 +2684,36 @@ two_byte_offset_repeat_as_copy_encodeBlockAsm10B: MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short 
CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -2493,14 +2722,14 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B -two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: +two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: CMPL BP, $0x0c - JGE 
emit_copy_three_repeat_as_copy_encodeBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -2509,152 +2738,152 @@ two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B + JMP repeat_end_emit_encodeBlockAsm12B -emit_copy_three_repeat_as_copy_encodeBlockAsm10B: +emit_copy_three_repeat_as_copy_encodeBlockAsm12B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBlockAsm10B: +repeat_end_emit_encodeBlockAsm12B: MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm10B + JMP search_loop_encodeBlockAsm12B -no_repeat_found_encodeBlockAsm10B: +no_repeat_found_encodeBlockAsm12B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBlockAsm10B + JEQ candidate_match_encodeBlockAsm12B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BP LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm10B + JEQ candidate2_match_encodeBlockAsm12B MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeBlockAsm10B + JEQ candidate3_match_encodeBlockAsm12B MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm10B + JMP search_loop_encodeBlockAsm12B -candidate3_match_encodeBlockAsm10B: +candidate3_match_encodeBlockAsm12B: ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm10B + JMP candidate_match_encodeBlockAsm12B -candidate2_match_encodeBlockAsm10B: +candidate2_match_encodeBlockAsm12B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BP -candidate_match_encodeBlockAsm10B: +candidate_match_encodeBlockAsm12B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBlockAsm10B + JZ match_extend_back_end_encodeBlockAsm12B -match_extend_back_loop_encodeBlockAsm10B: +match_extend_back_loop_encodeBlockAsm12B: CMPL CX, SI - JLE 
match_extend_back_end_encodeBlockAsm10B + JLE match_extend_back_end_encodeBlockAsm12B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBlockAsm10B + JNE match_extend_back_end_encodeBlockAsm12B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBlockAsm10B - JMP match_extend_back_loop_encodeBlockAsm10B + JZ match_extend_back_end_encodeBlockAsm12B + JMP match_extend_back_loop_encodeBlockAsm12B -match_extend_back_end_encodeBlockAsm10B: +match_extend_back_end_encodeBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBlockAsm10B + JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBlockAsm10B: +match_dst_size_check_encodeBlockAsm12B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm10B + JEQ emit_literal_done_match_emit_encodeBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm10B + JLT one_byte_match_emit_encodeBlockAsm12B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm10B + JLT two_bytes_match_emit_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm12B -two_bytes_match_emit_encodeBlockAsm10B: +two_bytes_match_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBlockAsm10B - JMP memmove_long_match_emit_encodeBlockAsm10B + JL memmove_match_emit_encodeBlockAsm12B + JMP memmove_long_match_emit_encodeBlockAsm12B -one_byte_match_emit_encodeBlockAsm10B: +one_byte_match_emit_encodeBlockAsm12B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBlockAsm10B: +memmove_match_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x03 - JB 
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (SI), R9 MOVQ 
-8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B + JMP memmove_end_copy_match_emit_encodeBlockAsm12B -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -2664,11 +2893,11 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBlockAsm10B: +memmove_end_copy_match_emit_encodeBlockAsm12B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm10B + JMP emit_literal_done_match_emit_encodeBlockAsm12B -memmove_long_match_emit_encodeBlockAsm10B: +memmove_long_match_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), DI // genMemMoveLong @@ -2683,11 +2912,11 @@ memmove_long_match_emit_encodeBlockAsm10B: MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) @@ -2696,24 +2925,24 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back 
-emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBlockAsm10B: -match_nolit_loop_encodeBlockAsm10B: +emit_literal_done_match_emit_encodeBlockAsm12B: +match_nolit_loop_encodeBlockAsm12B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -2727,46 +2956,46 @@ match_nolit_loop_encodeBlockAsm10B: // matchLen XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm10B + JL matchlen_single_match_nolit_encodeBlockAsm12B -matchlen_loopback_match_nolit_encodeBlockAsm10B: +matchlen_loopback_match_nolit_encodeBlockAsm12B: MOVQ (DI)(R9*1), R8 XORQ (BP)(R9*1), R8 TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm10B + JZ matchlen_loop_match_nolit_encodeBlockAsm12B BSFQ R8, R8 SARQ $0x03, R8 LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm10B + JMP match_nolit_end_encodeBlockAsm12B -matchlen_loop_match_nolit_encodeBlockAsm10B: +matchlen_loop_match_nolit_encodeBlockAsm12B: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm10B + JGE matchlen_loopback_match_nolit_encodeBlockAsm12B -matchlen_single_match_nolit_encodeBlockAsm10B: +matchlen_single_match_nolit_encodeBlockAsm12B: TESTL SI, SI - JZ match_nolit_end_encodeBlockAsm10B + JZ match_nolit_end_encodeBlockAsm12B -matchlen_single_loopback_match_nolit_encodeBlockAsm10B: +matchlen_single_loopback_match_nolit_encodeBlockAsm12B: MOVB (DI)(R9*1), R8 CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm10B + JNE match_nolit_end_encodeBlockAsm12B 
LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B -match_nolit_end_encodeBlockAsm10B: +match_nolit_end_encodeBlockAsm12B: ADDL R9, CX MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm10B: +two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B MOVB $0xee, (AX) MOVW BP, 1(AX) LEAL -60(R9), R9 @@ -2776,36 +3005,36 @@ two_byte_offset_match_nolit_encodeBlockAsm10B: MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL BP, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: CMPL R9, $0x00000104 - JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: SHLL 
$0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) @@ -2814,14 +3043,14 @@ repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBlockAsm12B -two_byte_offset_short_match_nolit_encodeBlockAsm10B: +two_byte_offset_short_match_nolit_encodeBlockAsm12B: CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm10B + JGE emit_copy_three_match_nolit_encodeBlockAsm12B CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBlockAsm10B + JGE emit_copy_three_match_nolit_encodeBlockAsm12B MOVB $0x01, BL LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) @@ -2830,138 +3059,138 @@ two_byte_offset_short_match_nolit_encodeBlockAsm10B: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP match_nolit_emitcopy_end_encodeBlockAsm12B -emit_copy_three_match_nolit_encodeBlockAsm10B: +emit_copy_three_match_nolit_encodeBlockAsm12B: MOVB $0x02, BL LEAL -4(BX)(R9*4), R9 MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBlockAsm10B: +match_nolit_emitcopy_end_encodeBlockAsm12B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm10B + JGE emit_remainder_encodeBlockAsm12B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm10B + JL match_nolit_dst_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R8 +match_nolit_dst_ok_encodeBlockAsm12B: + MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BP - SHLQ $0x20, DI + SHLQ $0x18, 
DI IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BP + SHRQ $0x34, DI + SHLQ $0x18, BP IMULQ R8, BP - SHRQ $0x36, BP + SHRQ $0x34, BP LEAL -2(CX), R8 LEAQ 24(SP)(BP*4), R9 MOVL (R9), BP MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeBlockAsm10B + JEQ match_nolit_loop_encodeBlockAsm12B INCL CX - JMP search_loop_encodeBlockAsm10B + JMP search_loop_encodeBlockAsm12B -emit_remainder_encodeBlockAsm10B: +emit_remainder_encodeBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm10B + JL emit_remainder_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBlockAsm10B: +emit_remainder_ok_encodeBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm10B + JLT one_byte_emit_remainder_encodeBlockAsm12B CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm10B + JLT two_bytes_emit_remainder_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm12B -two_bytes_emit_remainder_encodeBlockAsm10B: +two_bytes_emit_remainder_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm10B - JMP memmove_long_emit_remainder_encodeBlockAsm10B + JL memmove_emit_remainder_encodeBlockAsm12B + JMP memmove_long_emit_remainder_encodeBlockAsm12B -one_byte_emit_remainder_encodeBlockAsm10B: +one_byte_emit_remainder_encodeBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBlockAsm10B: +memmove_emit_remainder_encodeBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX // genMemMoveShort CMPQ BX, $0x03 - JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B 
-emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -2971,11 +3200,11 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBlockAsm10B: +memmove_end_copy_emit_remainder_encodeBlockAsm12B: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + JMP emit_literal_done_emit_remainder_encodeBlockAsm12B -memmove_long_emit_remainder_encodeBlockAsm10B: +memmove_long_emit_remainder_encodeBlockAsm12B: LEAQ (AX)(BP*1), DX MOVL BP, BX @@ -2991,11 +3220,11 @@ memmove_long_emit_remainder_encodeBlockAsm10B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOA X4, (R8) @@ -3004,37 +3233,37 @@ 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: ADDQ $0x20, BP ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBlockAsm10B: +emit_literal_done_emit_remainder_encodeBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBlockAsm8B(dst []byte, src []byte) int +// func encodeBlockAsm10B(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBlockAsm8B(SB), $1048-56 +TEXT ·encodeBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX + MOVQ $0x00000020, CX LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBlockAsm8B: +zero_loop_encodeBlockAsm10B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -3045,7 +3274,7 @@ zero_loop_encodeBlockAsm8B: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBlockAsm8B + JNZ zero_loop_encodeBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX @@ -3059,14 +3288,14 @@ zero_loop_encodeBlockAsm8B: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBlockAsm8B: +search_loop_encodeBlockAsm10B: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP - SHRL $0x04, BP + SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBlockAsm8B + JGE emit_remainder_encodeBlockAsm10B MOVL BP, 20(SP) MOVQ 
$0x9e3779b1, R8 MOVQ SI, R9 @@ -3074,10 +3303,10 @@ search_loop_encodeBlockAsm8B: SHRQ $0x08, R10 SHLQ $0x20, R9 IMULQ R8, R9 - SHRQ $0x38, R9 + SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ R8, R10 - SHRQ $0x38, R10 + SHRQ $0x36, R10 MOVL 24(SP)(R9*4), BP MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) @@ -3087,113 +3316,113 @@ search_loop_encodeBlockAsm8B: SHRQ $0x10, R9 SHLQ $0x20, R9 IMULQ R8, R9 - SHRQ $0x38, R9 + SHRQ $0x36, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm8B + JNE no_repeat_found_encodeBlockAsm10B LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBlockAsm8B + JZ repeat_extend_back_end_encodeBlockAsm10B -repeat_extend_back_loop_encodeBlockAsm8B: +repeat_extend_back_loop_encodeBlockAsm10B: CMPL SI, DI - JLE repeat_extend_back_end_encodeBlockAsm8B + JLE repeat_extend_back_end_encodeBlockAsm10B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBlockAsm8B + JNE repeat_extend_back_end_encodeBlockAsm10B LEAL -1(SI), SI DECL BP - JNZ repeat_extend_back_loop_encodeBlockAsm8B + JNZ repeat_extend_back_loop_encodeBlockAsm10B -repeat_extend_back_end_encodeBlockAsm8B: +repeat_extend_back_end_encodeBlockAsm10B: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT one_byte_repeat_emit_encodeBlockAsm8B + JLT one_byte_repeat_emit_encodeBlockAsm10B CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBlockAsm8B + JLT two_bytes_repeat_emit_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm10B -two_bytes_repeat_emit_encodeBlockAsm8B: +two_bytes_repeat_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, 
$0x40 - JL memmove_repeat_emit_encodeBlockAsm8B - JMP memmove_long_repeat_emit_encodeBlockAsm8B + JL memmove_repeat_emit_encodeBlockAsm10B + JMP memmove_long_repeat_emit_encodeBlockAsm10B -one_byte_repeat_emit_encodeBlockAsm8B: +one_byte_repeat_emit_encodeBlockAsm10B: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBlockAsm8B: +memmove_repeat_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), BP // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B 
-emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -3203,11 +3432,11 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBlockAsm8B: +memmove_end_copy_repeat_emit_encodeBlockAsm10B: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + JMP emit_literal_done_repeat_emit_encodeBlockAsm10B -memmove_long_repeat_emit_encodeBlockAsm8B: +memmove_long_repeat_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), BP // genMemMoveLong @@ -3222,11 +3451,11 @@ memmove_long_repeat_emit_encodeBlockAsm8B: MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 
-emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) @@ -3235,23 +3464,23 @@ emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBlockAsm8B: +emit_literal_done_repeat_emit_encodeBlockAsm10B: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -3263,74 +3492,78 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm8B + JL matchlen_single_repeat_extend_encodeBlockAsm10B -matchlen_loopback_repeat_extend_encodeBlockAsm8B: +matchlen_loopback_repeat_extend_encodeBlockAsm10B: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm8B + JZ matchlen_loop_repeat_extend_encodeBlockAsm10B BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm8B + JMP repeat_extend_forward_end_encodeBlockAsm10B -matchlen_loop_repeat_extend_encodeBlockAsm8B: +matchlen_loop_repeat_extend_encodeBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B 
+ JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B -matchlen_single_repeat_extend_encodeBlockAsm8B: +matchlen_single_repeat_extend_encodeBlockAsm10B: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBlockAsm8B + JZ repeat_extend_forward_end_encodeBlockAsm10B -matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: +matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm8B + JNE repeat_extend_forward_end_encodeBlockAsm10B LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B -repeat_extend_forward_end_encodeBlockAsm8B: +repeat_extend_forward_end_encodeBlockAsm10B: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm8B + JZ repeat_as_copy_encodeBlockAsm10B // emitRepeat - MOVL BP, SI + MOVL BP, DI LEAL -4(BP), BP - CMPL SI, $0x08 - JLE repeat_two_match_repeat_encodeBlockAsm8B - CMPL SI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm10B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBlockAsm10B -cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: +cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBlockAsm8B + JLT repeat_three_match_repeat_encodeBlockAsm10B LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_three_match_repeat_encodeBlockAsm8B: +repeat_three_match_repeat_encodeBlockAsm10B: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_two_match_repeat_encodeBlockAsm8B: 
+repeat_two_match_repeat_encodeBlockAsm10B: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_match_repeat_encodeBlockAsm10B: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -3339,48 +3572,52 @@ repeat_two_match_repeat_encodeBlockAsm8B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_as_copy_encodeBlockAsm8B: +repeat_as_copy_encodeBlockAsm10B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm8B: +two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX // emitRepeat - MOVL BP, SI + MOVL BP, DI LEAL -4(BP), BP - CMPL SI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - 
JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -3389,12 +3626,14 @@ repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B -two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: +two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -3403,152 +3642,152 @@ two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B + JMP repeat_end_emit_encodeBlockAsm10B -emit_copy_three_repeat_as_copy_encodeBlockAsm8B: +emit_copy_three_repeat_as_copy_encodeBlockAsm10B: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBlockAsm8B: +repeat_end_emit_encodeBlockAsm10B: MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm8B + JMP search_loop_encodeBlockAsm10B -no_repeat_found_encodeBlockAsm8B: +no_repeat_found_encodeBlockAsm10B: CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBlockAsm8B + JEQ candidate_match_encodeBlockAsm10B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BP LEAL 2(CX), R8 CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm8B + JEQ candidate2_match_encodeBlockAsm10B 
MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BP*1), SI - JEQ candidate3_match_encodeBlockAsm8B + JEQ candidate3_match_encodeBlockAsm10B MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm8B + JMP search_loop_encodeBlockAsm10B -candidate3_match_encodeBlockAsm8B: +candidate3_match_encodeBlockAsm10B: ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm8B + JMP candidate_match_encodeBlockAsm10B -candidate2_match_encodeBlockAsm8B: +candidate2_match_encodeBlockAsm10B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BP -candidate_match_encodeBlockAsm8B: +candidate_match_encodeBlockAsm10B: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBlockAsm8B + JZ match_extend_back_end_encodeBlockAsm10B -match_extend_back_loop_encodeBlockAsm8B: +match_extend_back_loop_encodeBlockAsm10B: CMPL CX, SI - JLE match_extend_back_end_encodeBlockAsm8B + JLE match_extend_back_end_encodeBlockAsm10B MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBlockAsm8B + JNE match_extend_back_end_encodeBlockAsm10B LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBlockAsm8B - JMP match_extend_back_loop_encodeBlockAsm8B + JZ match_extend_back_end_encodeBlockAsm10B + JMP match_extend_back_loop_encodeBlockAsm10B -match_extend_back_end_encodeBlockAsm8B: +match_extend_back_end_encodeBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBlockAsm8B + JL match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBlockAsm8B: +match_dst_size_check_encodeBlockAsm10B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm8B + JEQ emit_literal_done_match_emit_encodeBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c - JLT one_byte_match_emit_encodeBlockAsm8B + JLT one_byte_match_emit_encodeBlockAsm10B CMPL DI, $0x00000100 - JLT two_bytes_match_emit_encodeBlockAsm8B + JLT 
two_bytes_match_emit_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm10B -two_bytes_match_emit_encodeBlockAsm8B: +two_bytes_match_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 - JL memmove_match_emit_encodeBlockAsm8B - JMP memmove_long_match_emit_encodeBlockAsm8B + JL memmove_match_emit_encodeBlockAsm10B + JMP memmove_long_match_emit_encodeBlockAsm10B -one_byte_match_emit_encodeBlockAsm8B: +one_byte_match_emit_encodeBlockAsm10B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBlockAsm8B: +memmove_match_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3 CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2: MOVB (SI), R9 MOVB -1(SI)(R8*1), SI MOVB R9, (AX) MOVB SI, -1(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B 
-emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3: MOVW (SI), R9 MOVB 2(SI), SI MOVW R9, (AX) MOVB SI, 2(AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7: MOVL (SI), R9 MOVL -4(SI)(R8*1), SI MOVL R9, (AX) MOVL SI, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B + JMP memmove_end_copy_match_emit_encodeBlockAsm10B -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 @@ -3558,11 +3797,11 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_match_emit_encodeBlockAsm8B: +memmove_end_copy_match_emit_encodeBlockAsm10B: MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm8B + JMP emit_literal_done_match_emit_encodeBlockAsm10B -memmove_long_match_emit_encodeBlockAsm8B: +memmove_long_match_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), DI // genMemMoveLong @@ -3577,11 +3816,11 @@ memmove_long_match_emit_encodeBlockAsm8B: MOVQ 
$0x00000040, R11 SUBQ R9, R11 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) @@ -3590,24 +3829,24 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX -emit_literal_done_match_emit_encodeBlockAsm8B: -match_nolit_loop_encodeBlockAsm8B: +emit_literal_done_match_emit_encodeBlockAsm10B: +match_nolit_loop_encodeBlockAsm10B: MOVL CX, SI SUBL BP, SI MOVL SI, 16(SP) @@ -3621,81 +3860,85 @@ match_nolit_loop_encodeBlockAsm8B: // matchLen XORL R9, R9 CMPL SI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm8B + JL matchlen_single_match_nolit_encodeBlockAsm10B -matchlen_loopback_match_nolit_encodeBlockAsm8B: +matchlen_loopback_match_nolit_encodeBlockAsm10B: MOVQ (DI)(R9*1), R8 XORQ (BP)(R9*1), R8 TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm8B + JZ matchlen_loop_match_nolit_encodeBlockAsm10B BSFQ R8, R8 SARQ $0x03, R8 LEAL (R9)(R8*1), R9 - JMP 
match_nolit_end_encodeBlockAsm8B + JMP match_nolit_end_encodeBlockAsm10B -matchlen_loop_match_nolit_encodeBlockAsm8B: +matchlen_loop_match_nolit_encodeBlockAsm10B: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBlockAsm8B + JGE matchlen_loopback_match_nolit_encodeBlockAsm10B -matchlen_single_match_nolit_encodeBlockAsm8B: +matchlen_single_match_nolit_encodeBlockAsm10B: TESTL SI, SI - JZ match_nolit_end_encodeBlockAsm8B + JZ match_nolit_end_encodeBlockAsm10B -matchlen_single_loopback_match_nolit_encodeBlockAsm8B: +matchlen_single_loopback_match_nolit_encodeBlockAsm10B: MOVB (DI)(R9*1), R8 CMPB (BP)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm8B + JNE match_nolit_end_encodeBlockAsm10B LEAL 1(R9), R9 DECL SI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B -match_nolit_end_encodeBlockAsm8B: +match_nolit_end_encodeBlockAsm10B: ADDL R9, CX MOVL 16(SP), BP ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm8B: +two_byte_offset_match_nolit_encodeBlockAsm10B: CMPL R9, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B MOVB $0xee, (AX) MOVW BP, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX // emitRepeat - MOVL R9, BP + MOVL R9, SI LEAL -4(R9), R9 - CMPL BP, $0x08 - JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL BP, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: CMPL R9, $0x00000104 - JLT 
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short + JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBlockAsm10B -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBlockAsm10B -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) @@ -3704,12 +3947,14 @@ repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBlockAsm8B: + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBlockAsm10B + +two_byte_offset_short_match_nolit_encodeBlockAsm10B: CMPL R9, $0x0c - JGE emit_copy_three_match_nolit_encodeBlockAsm8B + JGE emit_copy_three_match_nolit_encodeBlockAsm10B + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBlockAsm10B MOVB $0x01, BL LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) @@ -3718,138 +3963,2249 @@ two_byte_offset_short_match_nolit_encodeBlockAsm8B: ORL BP, R9 MOVB R9, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP match_nolit_emitcopy_end_encodeBlockAsm10B -emit_copy_three_match_nolit_encodeBlockAsm8B: +emit_copy_three_match_nolit_encodeBlockAsm10B: MOVB $0x02, BL LEAL -4(BX)(R9*4), R9 MOVB R9, (AX) 
MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBlockAsm8B: +match_nolit_emitcopy_end_encodeBlockAsm10B: CMPL CX, 8(SP) - JGE emit_remainder_encodeBlockAsm8B + JGE emit_remainder_encodeBlockAsm10B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBlockAsm8B + JL match_nolit_dst_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBlockAsm8B: +match_nolit_dst_ok_encodeBlockAsm10B: MOVQ $0x9e3779b1, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BP SHLQ $0x20, DI IMULQ R8, DI - SHRQ $0x38, DI + SHRQ $0x36, DI SHLQ $0x20, BP IMULQ R8, BP - SHRQ $0x38, BP + SHRQ $0x36, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeBlockAsm10B + INCL CX + JMP search_loop_encodeBlockAsm10B + +emit_remainder_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm10B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm10B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT one_byte_emit_remainder_encodeBlockAsm10B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm10B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +two_bytes_emit_remainder_encodeBlockAsm10B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm10B + JMP memmove_long_emit_remainder_encodeBlockAsm10B + +one_byte_emit_remainder_encodeBlockAsm10B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm10B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm10B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm10B + +memmove_long_emit_remainder_encodeBlockAsm10B: + LEAQ 
(AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm10B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBlockAsm8B(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBlockAsm8B(SB), $1048-56 + MOVQ dst_base+0(FP), AX + MOVQ $0x00000008, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBlockAsm8B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBlockAsm8B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -5(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBlockAsm8B: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 
4(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 + SHRQ $0x08, R10 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 + JNE no_repeat_found_encodeBlockAsm8B + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBlockAsm8B + +repeat_extend_back_loop_encodeBlockAsm8B: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBlockAsm8B + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBlockAsm8B + +repeat_extend_back_end_encodeBlockAsm8B: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBlockAsm8B + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +two_bytes_repeat_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBlockAsm8B + JMP memmove_long_repeat_emit_encodeBlockAsm8B + +one_byte_repeat_emit_encodeBlockAsm8B: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB 
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBlockAsm8B: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBlockAsm8B + +memmove_long_repeat_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ 
R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBlockAsm8B: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBlockAsm8B + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_loop_repeat_extend_encodeBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B + +matchlen_single_repeat_extend_encodeBlockAsm8B: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBlockAsm8B + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B + +repeat_extend_forward_end_encodeBlockAsm8B: + ADDL R11, CX + 
MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBlockAsm8B + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_match_repeat_encodeBlockAsm8B + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBlockAsm8B + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_match_repeat_encodeBlockAsm8B: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_match_repeat_encodeBlockAsm8B: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_as_copy_encodeBlockAsm8B: + // emitCopy +two_byte_offset_repeat_as_copy_encodeBlockAsm8B: + CMPL BP, $0x40 + JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat + MOVL BP, SI + LEAL -4(BP), BP + CMPL SI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + CMPL SI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + 
+repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + +two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + +emit_copy_three_repeat_as_copy_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBlockAsm8B + +no_repeat_found_encodeBlockAsm8B: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBlockAsm8B + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI + JEQ candidate2_match_encodeBlockAsm8B + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI + JEQ candidate3_match_encodeBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBlockAsm8B + +candidate3_match_encodeBlockAsm8B: + ADDL $0x02, CX + JMP candidate_match_encodeBlockAsm8B + +candidate2_match_encodeBlockAsm8B: + MOVL R8, 24(SP)(R9*4) + INCL CX + MOVL DI, BP + +candidate_match_encodeBlockAsm8B: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBlockAsm8B + +match_extend_back_loop_encodeBlockAsm8B: + CMPL CX, SI + JLE match_extend_back_end_encodeBlockAsm8B + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBlockAsm8B + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBlockAsm8B + JMP match_extend_back_loop_encodeBlockAsm8B + +match_extend_back_end_encodeBlockAsm8B: + MOVL CX, SI + SUBL 
12(SP), SI + LEAQ 3(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_dst_size_check_encodeBlockAsm8B: + MOVL CX, SI + MOVL 12(SP), DI + CMPL DI, SI + JEQ emit_literal_done_match_emit_encodeBlockAsm8B + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + LEAL -1(R8), DI + CMPL DI, $0x3c + JLT one_byte_match_emit_encodeBlockAsm8B + CMPL DI, $0x00000100 + JLT two_bytes_match_emit_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW DI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBlockAsm8B + +two_bytes_match_emit_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DI, 1(AX) + ADDQ $0x02, AX + CMPL DI, $0x40 + JL memmove_match_emit_encodeBlockAsm8B + JMP memmove_long_match_emit_encodeBlockAsm8B + +one_byte_match_emit_encodeBlockAsm8B: + SHLB $0x02, DI + MOVB DI, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2: + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3: + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7: + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) + JMP 
memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeBlockAsm8B + +emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_match_emit_encodeBlockAsm8B: + MOVQ DI, AX + JMP emit_literal_done_match_emit_encodeBlockAsm8B + +memmove_long_match_emit_encodeBlockAsm8B: + LEAQ (AX)(R8*1), DI + + // genMemMoveLong + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 + MOVQ R8, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(SI)(R11*1), R9 + LEAQ -32(AX)(R11*1), R12 + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 + ADDQ $0x20, R11 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(SI)(R11*1), X4 + MOVOU -16(SI)(R11*1), X5 + MOVOA X4, -32(AX)(R11*1) + MOVOA X5, -16(AX)(R11*1) + ADDQ $0x20, R11 + CMPQ R8, R11 + JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ DI, AX + +emit_literal_done_match_emit_encodeBlockAsm8B: 
+match_nolit_loop_encodeBlockAsm8B: + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R9, R9 + CMPL SI, $0x08 + JL matchlen_single_match_nolit_encodeBlockAsm8B + +matchlen_loopback_match_nolit_encodeBlockAsm8B: + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 + JZ matchlen_loop_match_nolit_encodeBlockAsm8B + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_loop_match_nolit_encodeBlockAsm8B: + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBlockAsm8B + +matchlen_single_match_nolit_encodeBlockAsm8B: + TESTL SI, SI + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_single_loopback_match_nolit_encodeBlockAsm8B: + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 + JNE match_nolit_end_encodeBlockAsm8B + LEAL 1(R9), R9 + DECL SI + JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B + +match_nolit_end_encodeBlockAsm8B: + ADDL R9, CX + MOVL 16(SP), BP + ADDL $0x04, R9 + MOVL CX, 12(SP) + + // emitCopy +two_byte_offset_match_nolit_encodeBlockAsm8B: + CMPL R9, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + + // emitRepeat + MOVL R9, BP + LEAL -4(R9), R9 + CMPL BP, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short + CMPL BP, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: + CMPL R9, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short + LEAL -256(R9), R9 + MOVW $0x0019, (AX) + MOVW R9, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: + LEAL -4(R9), R9 + MOVW $0x0015, (AX) + MOVB R9, 2(AX) + ADDQ $0x03, AX + JMP 
match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBlockAsm8B + +two_byte_offset_short_match_nolit_encodeBlockAsm8B: + CMPL R9, $0x0c + JGE emit_copy_three_match_nolit_encodeBlockAsm8B + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R9 + MOVB R9, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +emit_copy_three_match_nolit_encodeBlockAsm8B: + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBlockAsm8B: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBlockAsm8B + MOVQ -2(DX)(CX*1), SI + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBlockAsm8B: + MOVQ $0x9e3779b1, R8 + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, BP + SHLQ $0x20, DI + IMULQ R8, DI + SHRQ $0x38, DI + SHLQ $0x20, BP + IMULQ R8, BP + SHRQ $0x38, BP + LEAL -2(CX), R8 + LEAQ 24(SP)(BP*4), R9 + MOVL (R9), BP + MOVL R8, 24(SP)(DI*4) + MOVL CX, (R9) + CMPL (DX)(BP*1), SI + JEQ match_nolit_loop_encodeBlockAsm8B + INCL CX + JMP search_loop_encodeBlockAsm8B + +emit_remainder_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + SUBL 12(SP), CX + LEAQ 3(AX)(CX*1), CX + CMPQ CX, (SP) + JL emit_remainder_ok_encodeBlockAsm8B + MOVQ $0x00000000, ret+48(FP) + RET + +emit_remainder_ok_encodeBlockAsm8B: + MOVQ src_len+32(FP), CX + MOVL 12(SP), BX + CMPL BX, CX + JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + MOVL CX, BP + MOVL CX, 12(SP) + LEAQ (DX)(BX*1), CX + SUBL BX, BP + LEAL -1(BP), DX + CMPL DX, $0x3c + JLT 
one_byte_emit_remainder_encodeBlockAsm8B + CMPL DX, $0x00000100 + JLT two_bytes_emit_remainder_encodeBlockAsm8B + MOVB $0xf4, (AX) + MOVW DX, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +two_bytes_emit_remainder_encodeBlockAsm8B: + MOVB $0xf0, (AX) + MOVB DL, 1(AX) + ADDQ $0x02, AX + CMPL DX, $0x40 + JL memmove_emit_remainder_encodeBlockAsm8B + JMP memmove_long_emit_remainder_encodeBlockAsm8B + +one_byte_emit_remainder_encodeBlockAsm8B: + SHLB $0x02, DL + MOVB DL, (AX) + ADDQ $0x01, AX + +memmove_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 + CMPQ BX, $0x08 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 + CMPQ BX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + CMPQ BX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: + MOVB (CX), BP + MOVB -1(CX)(BX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: + MOVW (CX), BP + MOVB 2(CX), CL + MOVW BP, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: + MOVL (CX), BP + MOVL -4(CX)(BX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: + MOVQ (CX), BP + MOVQ -8(CX)(BX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + 
+emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: + MOVOU (CX), X0 + MOVOU -16(CX)(BX*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + +memmove_end_copy_emit_remainder_encodeBlockAsm8B: + MOVQ DX, AX + JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + +memmove_long_emit_remainder_encodeBlockAsm8B: + LEAQ (AX)(BP*1), DX + MOVL BP, BX + + // genMemMoveLong + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(BX*1), X2 + MOVOU -16(CX)(BX*1), X3 + MOVQ BX, SI + SHRQ $0x05, SI + MOVQ AX, BP + ANDL $0x0000001f, BP + MOVQ $0x00000040, DI + SUBQ BP, DI + DECQ SI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(CX)(DI*1), BP + LEAQ -32(AX)(DI*1), R8 + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: + MOVOU (BP), X4 + MOVOU 16(BP), X5 + MOVOA X4, (R8) + MOVOA X5, 16(R8) + ADDQ $0x20, R8 + ADDQ $0x20, BP + ADDQ $0x20, DI + DECQ SI + JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(CX)(DI*1), X4 + MOVOU -16(CX)(DI*1), X5 + MOVOA X4, -32(AX)(DI*1) + MOVOA X5, -16(AX)(DI*1) + ADDQ $0x20, DI + CMPQ BX, DI + JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(BX*1) + MOVOU X3, -16(AX)(BX*1) + MOVQ DX, AX + +emit_literal_done_emit_remainder_encodeBlockAsm8B: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ AX, ret+48(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte) int +// Requires: SSE2 +TEXT ·encodeBetterBlockAsm(SB), $327704-56 + MOVQ dst_base+0(FP), 
AX + MOVQ $0x00000a00, CX + LEAQ 24(SP), DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), BP + MOVL BP, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL CX, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm: + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x07, BP + LEAL 1(CX)(BP*1), BP + CMPL BP, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + MOVL BP, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R8 + MOVQ $0x9e3779b1, BP + MOVQ SI, R9 + MOVQ SI, R10 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ BP, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 262168(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + MOVL CX, 262168(SP)(R10*4) + MOVL CX, R9 + SUBL 16(SP), R9 + MOVL 1(DX)(R9*1), R10 + MOVQ SI, R9 + SHRQ $0x08, R9 + CMPL R9, R10 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP + JZ repeat_extend_back_end_encodeBetterBlockAsm + +repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL SI, DI + JLE repeat_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 + JNE repeat_extend_back_end_encodeBetterBlockAsm + LEAL -1(SI), SI + DECL BP + JNZ repeat_extend_back_loop_encodeBetterBlockAsm + +repeat_extend_back_end_encodeBetterBlockAsm: + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R9 + SUBL BP, R8 + LEAL -1(R8), BP + CMPL BP, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x00010000 + 
JLT three_bytes_repeat_emit_encodeBetterBlockAsm + CMPL BP, $0x01000000 + JLT four_bytes_repeat_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +four_bytes_repeat_emit_encodeBetterBlockAsm: + MOVL BP, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +three_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +two_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +one_byte_repeat_emit_encodeBetterBlockAsm: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BP + + // genMemMoveShort + CMPQ R8, $0x03 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R9), R10 + MOVB 2(R9), R9 + MOVW R10, (AX) + MOVB R9, 2(AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: + 
MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm: + MOVQ BP, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + +memmove_long_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), BP + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ BP, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm: + ADDL $0x05, CX + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_single_repeat_extend_encodeBetterBlockAsm + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm: + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_loop_repeat_extend_encodeBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + +matchlen_single_repeat_extend_encodeBetterBlockAsm: + TESTL R8, R8 + JZ repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + DECL R8 + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm + +repeat_extend_forward_end_encodeBetterBlockAsm: + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI + JZ repeat_as_copy_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_repeat_encodeBetterBlockAsm: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm: + CMPL BP, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm + CMPL BP, $0x00010100 + JLT 
repeat_four_match_repeat_encodeBetterBlockAsm + CMPL BP, $0x0100ffff + JLT repeat_five_match_repeat_encodeBetterBlockAsm + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBetterBlockAsm + +repeat_five_match_repeat_encodeBetterBlockAsm: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_match_repeat_encodeBetterBlockAsm: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_match_repeat_encodeBetterBlockAsm: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_match_repeat_encodeBetterBlockAsm: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_match_repeat_encodeBetterBlockAsm: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_as_copy_encodeBetterBlockAsm: + // emitCopy + CMPL SI, $0x00010000 + JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL SI, 1(AX) + LEAL -64(BP), BP + ADDQ $0x05, AX + CMPL BP, $0x04 + JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL SI, $0x00000800 + JLT 
repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy + +repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm + +four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm: + TESTL BP, BP + JZ repeat_end_emit_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x40 + JLE 
two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW SI, 1(AX) + LEAL -60(BP), BP + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + MOVL BP, DI + LEAL -4(BP), BP + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(BP), BP + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + +repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(BP), BP + MOVL BP, SI + MOVW $0x001d, (AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(BP), BP + MOVW $0x0019, (AX) + MOVW BP, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(BP), BP + MOVW $0x0015, (AX) + MOVB BP, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, 
SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: + CMPL BP, $0x0c + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + CMPL SI, $0x00000800 + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + +repeat_end_emit_encodeBetterBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm + +no_repeat_found_encodeBetterBlockAsm: + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + CMPL (DX)(DI*1), SI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm + +candidateS_match_encodeBetterBlockAsm: + SHRQ $0x08, SI + MOVQ SI, R9 + SHLQ $0x08, R9 + IMULQ R8, R9 + SHRQ $0x30, R9 + MOVL 24(SP)(R9*4), BP + INCL CX + MOVL CX, 24(SP)(R9*4) + CMPL (DX)(BP*1), SI + JEQ candidate_match_encodeBetterBlockAsm + DECL CX + MOVL DI, BP + +candidate_match_encodeBetterBlockAsm: + MOVL 12(SP), SI + TESTL BP, BP + JZ match_extend_back_end_encodeBetterBlockAsm + +match_extend_back_loop_encodeBetterBlockAsm: + CMPL CX, SI + JLE match_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(CX*1), DI + CMPB BL, DI + JNE match_extend_back_end_encodeBetterBlockAsm + LEAL -1(CX), CX + DECL BP + JZ match_extend_back_end_encodeBetterBlockAsm + JMP match_extend_back_loop_encodeBetterBlockAsm + +match_extend_back_end_encodeBetterBlockAsm: + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 5(AX)(SI*1), SI + CMPQ SI, (SP) + JL match_dst_size_check_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + 
+match_dst_size_check_encodeBetterBlockAsm: + MOVL CX, SI + ADDL $0x04, CX + ADDL $0x04, BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), R9 + + // matchLen + XORL R11, R11 + CMPL DI, $0x08 + JL matchlen_single_match_nolit_encodeBetterBlockAsm + +matchlen_loopback_match_nolit_encodeBetterBlockAsm: + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_loop_match_nolit_encodeBetterBlockAsm: + LEAL -8(DI), DI + LEAL 8(R11), R11 + CMPL DI, $0x08 + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + +matchlen_single_match_nolit_encodeBetterBlockAsm: + TESTL DI, DI + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + MOVB (R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 + JNE match_nolit_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + DECL DI + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + +match_nolit_end_encodeBetterBlockAsm: + MOVL CX, DI + SUBL BP, DI + CMPL R11, $0x01 + JG match_length_ok_encodeBetterBlockAsm + CMPL DI, $0x0000ffff + JLE match_length_ok_encodeBetterBlockAsm + MOVL 20(SP), CX + INCL CX + JMP search_loop_encodeBetterBlockAsm + +match_length_ok_encodeBetterBlockAsm: + MOVL DI, 16(SP) + MOVL 12(SP), BP + CMPL BP, SI + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + LEAL -1(DI), BP + CMPL BP, $0x3c + JLT one_byte_match_emit_encodeBetterBlockAsm + CMPL BP, $0x00000100 + JLT two_bytes_match_emit_encodeBetterBlockAsm + CMPL BP, $0x00010000 + JLT three_bytes_match_emit_encodeBetterBlockAsm + CMPL BP, $0x01000000 + JLT four_bytes_match_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +four_bytes_match_emit_encodeBetterBlockAsm: + 
MOVL BP, R9 + SHRL $0x10, R9 + MOVB $0xf8, (AX) + MOVW BP, 1(AX) + MOVB R9, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +three_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_match_emit_encodeBetterBlockAsm + +two_bytes_match_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB BP, 1(AX) + ADDQ $0x02, AX + CMPL BP, $0x40 + JL memmove_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm + +one_byte_match_emit_encodeBetterBlockAsm: + SHLB $0x02, BP + MOVB BP, (AX) + ADDQ $0x01, AX + +memmove_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + + // genMemMoveShort + CMPQ DI, $0x03 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 + CMPQ DI, $0x08 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ DI, $0x10 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ DI, $0x20 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) + JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(DI*1) + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + +emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + +memmove_end_copy_match_emit_encodeBetterBlockAsm: + MOVQ BP, AX + JMP emit_literal_done_match_emit_encodeBetterBlockAsm + +memmove_long_match_emit_encodeBetterBlockAsm: + LEAQ (AX)(DI*1), BP + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ AX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 + JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) + MOVQ BP, AX + +emit_literal_done_match_emit_encodeBetterBlockAsm: + ADDL R11, CX + MOVL 16(SP), BP + ADDL $0x04, R11 + MOVL CX, 12(SP) + + // emitCopy + CMPL BP, $0x00010000 + JL 
two_byte_offset_match_nolit_encodeBetterBlockAsm + +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + MOVB $0xff, (AX) + MOVL BP, 1(AX) + LEAL -64(R11), R11 + ADDQ $0x05, AX + CMPL R11, $0x04 + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy + CMPL R11, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy + LEAL -16842747(R11), R11 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + 
+repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + +four_bytes_remain_match_nolit_encodeBetterBlockAsm: + TESTL R11, R11 + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + MOVB $0x03, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVL BP, 1(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +two_byte_offset_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + MOVB $0xee, (AX) + MOVW BP, 1(AX) + LEAL -60(R11), R11 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: + MOVL R11, DI + LEAL -4(R11), R11 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL BP, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + CMPL R11, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short + CMPL R11, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short + LEAL -16842747(R11), R11 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -65536(R11), R11 + MOVL R11, BP + MOVW $0x001d, (AX) + MOVW R11, 2(AX) + SARL $0x10, BP + MOVB BP, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + 
+repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -256(R11), R11 + MOVW $0x0019, (AX) + MOVW R11, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: + LEAL -4(R11), R11 + MOVW $0x0015, (AX) + MOVB R11, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: + SHLL $0x02, R11 + ORL $0x01, R11 + MOVW R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: + XORQ DI, DI + LEAL 1(DI)(R11*4), R11 + MOVB BP, 1(AX) + SARL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + +two_byte_offset_short_match_nolit_encodeBetterBlockAsm: + CMPL R11, $0x0c + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + CMPL BP, $0x00000800 + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + MOVB $0x01, BL + LEAL -16(BX)(R11*4), R11 + MOVB BP, 1(AX) + SHRL $0x08, BP + SHLL $0x05, BP + ORL BP, R11 + MOVB R11, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +emit_copy_three_match_nolit_encodeBetterBlockAsm: + MOVB $0x02, BL + LEAL -4(BX)(R11*4), R11 + MOVB R11, (AX) + MOVW BP, 1(AX) + ADDQ $0x03, AX + +match_nolit_emitcopy_end_encodeBetterBlockAsm: + CMPL CX, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm + CMPQ AX, (SP) + JL match_nolit_dst_ok_encodeBetterBlockAsm + MOVQ $0x00000000, ret+48(FP) + RET + +match_nolit_dst_ok_encodeBetterBlockAsm: + MOVQ $0x00cf1bbcdcbfa563, BP + MOVQ $0x9e3779b1, DI + INCL SI + MOVQ (DX)(SI*1), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 + LEAL 1(SI), R11 + MOVQ -2(DX)(CX*1), R8 + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL SI, 24(SP)(R9*4) + MOVL 
R11, 262168(SP)(R10*4) + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x08, R10 LEAL -2(CX), R8 - LEAQ 24(SP)(BP*4), R9 - MOVL (R9), BP - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BP*1), SI - JEQ match_nolit_loop_encodeBlockAsm8B - INCL CX - JMP search_loop_encodeBlockAsm8B + LEAL -1(CX), SI + SHLQ $0x08, R9 + IMULQ BP, R9 + SHRQ $0x30, R9 + SHLQ $0x20, R10 + IMULQ DI, R10 + SHRQ $0x32, R10 + MOVL R8, 24(SP)(R9*4) + MOVL SI, 262168(SP)(R10*4) + JMP search_loop_encodeBetterBlockAsm -emit_remainder_encodeBlockAsm8B: +emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX + LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBlockAsm8B + JL emit_remainder_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBlockAsm8B: +emit_remainder_ok_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBlockAsm8B + JLT one_byte_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBlockAsm8B + JLT two_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x00010000 + JLT three_bytes_emit_remainder_encodeBetterBlockAsm + CMPL DX, $0x01000000 + JLT four_bytes_emit_remainder_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL DX, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +four_bytes_emit_remainder_encodeBetterBlockAsm: + MOVL DX, BX + SHRL $0x10, BX + MOVB $0xf8, (AX) + MOVW DX, 1(AX) + MOVB BL, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_emit_remainder_encodeBetterBlockAsm + +three_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm8B + JMP 
memmove_long_emit_remainder_encodeBetterBlockAsm -two_bytes_emit_remainder_encodeBlockAsm8B: +two_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL memmove_emit_remainder_encodeBlockAsm8B - JMP memmove_long_emit_remainder_encodeBlockAsm8B + JL memmove_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm -one_byte_emit_remainder_encodeBlockAsm8B: +one_byte_emit_remainder_encodeBetterBlockAsm: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBlockAsm8B: +memmove_emit_remainder_encodeBetterBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -3859,11 +6215,11 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBlockAsm8B: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm8B + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm 
-memmove_long_emit_remainder_encodeBlockAsm8B: +memmove_long_emit_remainder_encodeBetterBlockAsm: LEAQ (AX)(BP*1), DX MOVL BP, BX @@ -3879,11 +6235,11 @@ memmove_long_emit_remainder_encodeBlockAsm8B: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOA X4, (R8) @@ -3892,37 +6248,37 @@ emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: ADDQ $0x20, BP ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBlockAsm8B: +emit_literal_done_emit_remainder_encodeBetterBlockAsm: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET -// func encodeBetterBlockAsm(dst []byte, src []byte) int +// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int // Requires: SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 +TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX LEAQ 24(SP), DX PXOR X0, X0 
-zero_loop_encodeBetterBlockAsm: +zero_loop_encodeBetterBlockAsm4MB: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) @@ -3933,7 +6289,7 @@ zero_loop_encodeBetterBlockAsm: MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX - JNZ zero_loop_encodeBetterBlockAsm + JNZ zero_loop_encodeBetterBlockAsm4MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX @@ -3947,14 +6303,14 @@ zero_loop_encodeBetterBlockAsm: MOVL CX, 16(SP) MOVQ src_base+24(FP), DX -search_loop_encodeBetterBlockAsm: +search_loop_encodeBetterBlockAsm4MB: MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x07, BP LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm + JGE emit_remainder_encodeBetterBlockAsm4MB MOVL BP, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R8 MOVQ $0x9e3779b1, BP @@ -3976,125 +6332,117 @@ search_loop_encodeBetterBlockAsm: MOVQ SI, R9 SHRQ $0x08, R9 CMPL R9, R10 - JNE no_repeat_found_encodeBetterBlockAsm + JNE no_repeat_found_encodeBetterBlockAsm4MB LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BP SUBL 16(SP), BP - JZ repeat_extend_back_end_encodeBetterBlockAsm + JZ repeat_extend_back_end_encodeBetterBlockAsm4MB -repeat_extend_back_loop_encodeBetterBlockAsm: +repeat_extend_back_loop_encodeBetterBlockAsm4MB: CMPL SI, DI - JLE repeat_extend_back_end_encodeBetterBlockAsm + JLE repeat_extend_back_end_encodeBetterBlockAsm4MB MOVB -1(DX)(BP*1), BL MOVB -1(DX)(SI*1), R8 CMPB BL, R8 - JNE repeat_extend_back_end_encodeBetterBlockAsm + JNE repeat_extend_back_end_encodeBetterBlockAsm4MB LEAL -1(SI), SI DECL BP - JNZ repeat_extend_back_loop_encodeBetterBlockAsm + JNZ repeat_extend_back_loop_encodeBetterBlockAsm4MB -repeat_extend_back_end_encodeBetterBlockAsm: +repeat_extend_back_end_encodeBetterBlockAsm4MB: MOVL 12(SP), BP CMPL BP, SI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 SUBL BP, R8 LEAL -1(R8), BP CMPL BP, $0x3c - JLT 
one_byte_repeat_emit_encodeBetterBlockAsm + JLT one_byte_repeat_emit_encodeBetterBlockAsm4MB CMPL BP, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm + JLT two_bytes_repeat_emit_encodeBetterBlockAsm4MB CMPL BP, $0x00010000 - JLT three_bytes_repeat_emit_encodeBetterBlockAsm - CMPL BP, $0x01000000 - JLT four_bytes_repeat_emit_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL BP, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm - -four_bytes_repeat_emit_encodeBetterBlockAsm: + JLT three_bytes_repeat_emit_encodeBetterBlockAsm4MB MOVL BP, R10 SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BP, 1(AX) MOVB R10, 3(AX) ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB -three_bytes_repeat_emit_encodeBetterBlockAsm: +three_bytes_repeat_emit_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB -two_bytes_repeat_emit_encodeBetterBlockAsm: +two_bytes_repeat_emit_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm - JMP memmove_long_repeat_emit_encodeBetterBlockAsm + JL memmove_repeat_emit_encodeBetterBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB -one_byte_repeat_emit_encodeBetterBlockAsm: +one_byte_repeat_emit_encodeBetterBlockAsm4MB: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_repeat_emit_encodeBetterBlockAsm: +memmove_repeat_emit_encodeBetterBlockAsm4MB: LEAQ (AX)(R8*1), BP // genMemMoveShort CMPQ R8, $0x03 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ R8, $0x08 - JB 
emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_1or2: +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_1or2: MOVB (R9), R10 MOVB -1(R9)(R8*1), R9 MOVB R10, (AX) MOVB R9, -1(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_3: +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_3: MOVW (R9), R10 MOVB 2(R9), R9 MOVW R10, (AX) MOVB R9, 2(AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP 
memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 @@ -4104,11 +6452,11 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) -memmove_end_copy_repeat_emit_encodeBetterBlockAsm: +memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB: MOVQ BP, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB -memmove_long_repeat_emit_encodeBetterBlockAsm: +memmove_long_repeat_emit_encodeBetterBlockAsm4MB: LEAQ (AX)(R8*1), BP // genMemMoveLong @@ -4123,11 +6471,11 @@ memmove_long_repeat_emit_encodeBetterBlockAsm: MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) @@ -4136,23 +6484,23 @@ emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back 
-emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BP, AX -emit_literal_done_repeat_emit_encodeBetterBlockAsm: +emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB: ADDL $0x05, CX MOVL CX, BP SUBL 16(SP), BP @@ -4164,70 +6512,59 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeBetterBlockAsm + JL matchlen_single_repeat_extend_encodeBetterBlockAsm4MB -matchlen_loopback_repeat_extend_encodeBetterBlockAsm: +matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB: MOVQ (R9)(R11*1), R10 XORQ (BP)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBetterBlockAsm + JMP repeat_extend_forward_end_encodeBetterBlockAsm4MB -matchlen_loop_repeat_extend_encodeBetterBlockAsm: +matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB -matchlen_single_repeat_extend_encodeBetterBlockAsm: +matchlen_single_repeat_extend_encodeBetterBlockAsm4MB: TESTL R8, R8 - JZ repeat_extend_forward_end_encodeBetterBlockAsm + JZ repeat_extend_forward_end_encodeBetterBlockAsm4MB -matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm: 
+matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm4MB: MOVB (R9)(R11*1), R10 CMPB (BP)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBetterBlockAsm + JNE repeat_extend_forward_end_encodeBetterBlockAsm4MB LEAL 1(R11), R11 DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm + JNZ matchlen_single_loopback_repeat_extend_encodeBetterBlockAsm4MB -repeat_extend_forward_end_encodeBetterBlockAsm: +repeat_extend_forward_end_encodeBetterBlockAsm4MB: ADDL R11, CX MOVL CX, BP SUBL SI, BP MOVL 16(SP), SI TESTL DI, DI - JZ repeat_as_copy_encodeBetterBlockAsm + JZ repeat_as_copy_encodeBetterBlockAsm4MB // emitRepeat -emit_repeat_again_match_repeat_encodeBetterBlockAsm: MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBetterBlockAsm + JLE repeat_two_match_repeat_encodeBetterBlockAsm4MB CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB CMPL SI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB -cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm: +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: CMPL BP, $0x00000104 - JLT repeat_three_match_repeat_encodeBetterBlockAsm + JLT repeat_three_match_repeat_encodeBetterBlockAsm4MB CMPL BP, $0x00010100 - JLT repeat_four_match_repeat_encodeBetterBlockAsm - CMPL BP, $0x0100ffff - JLT repeat_five_match_repeat_encodeBetterBlockAsm - LEAL -16842747(BP), BP - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_repeat_encodeBetterBlockAsm - -repeat_five_match_repeat_encodeBetterBlockAsm: + JLT repeat_four_match_repeat_encodeBetterBlockAsm4MB LEAL -65536(BP), BP MOVL BP, SI MOVW $0x001d, (AX) @@ -4235,30 +6572,30 @@ repeat_five_match_repeat_encodeBetterBlockAsm: SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX - JMP 
repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_four_match_repeat_encodeBetterBlockAsm: +repeat_four_match_repeat_encodeBetterBlockAsm4MB: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_three_match_repeat_encodeBetterBlockAsm: +repeat_three_match_repeat_encodeBetterBlockAsm4MB: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_two_match_repeat_encodeBetterBlockAsm: +repeat_two_match_repeat_encodeBetterBlockAsm4MB: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_two_offset_match_repeat_encodeBetterBlockAsm: +repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -4267,49 +6604,38 @@ repeat_two_offset_match_repeat_encodeBetterBlockAsm: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_as_copy_encodeBetterBlockAsm: +repeat_as_copy_encodeBetterBlockAsm4MB: // emitCopy CMPL SI, $0x00010000 - JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + JL two_byte_offset_repeat_as_copy_encodeBetterBlockAsm4MB -four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm: +four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm4MB: CMPL BP, $0x40 - JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + JLE four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm4MB MOVB $0xff, (AX) MOVL SI, 1(AX) LEAL -64(BP), BP ADDQ $0x05, AX CMPL BP, $0x04 - JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm + JL four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm4MB // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy: MOVL BP, DI LEAL -4(BP), BP 
CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy - CMPL DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy - CMPL SI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: - CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy - CMPL BP, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy - CMPL BP, $0x0100ffff - JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy - LEAL -16842747(BP), BP - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + CMPL SI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy -repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy: +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: + CMPL BP, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy + CMPL BP, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy LEAL -65536(BP), BP MOVL BP, SI MOVW $0x001d, (AX) @@ -4317,30 +6643,30 @@ repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy: SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy: +repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy: 
+repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy: +repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -4349,53 +6675,42 @@ repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB + JMP four_bytes_loop_back_repeat_as_copy_encodeBetterBlockAsm4MB -four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm: +four_bytes_remain_repeat_as_copy_encodeBetterBlockAsm4MB: TESTL BP, BP - JZ repeat_end_emit_encodeBetterBlockAsm + JZ repeat_end_emit_encodeBetterBlockAsm4MB MOVB $0x03, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVL SI, 1(AX) ADDQ $0x05, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -two_byte_offset_repeat_as_copy_encodeBetterBlockAsm: +two_byte_offset_repeat_as_copy_encodeBetterBlockAsm4MB: CMPL BP, $0x40 - JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm + JLE two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm4MB MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BP), BP ADDQ $0x03, AX // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: MOVL BP, DI LEAL -4(BP), BP CMPL DI, $0x08 - JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + JLE repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short CMPL 
DI, $0x0c - JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + JGE cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short CMPL SI, $0x00000800 - JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + JLT repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short -cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: +cant_repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: CMPL BP, $0x00000104 - JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short + JLT repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short CMPL BP, $0x00010100 - JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short - CMPL BP, $0x0100ffff - JLT repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(BP), BP - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short - -repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: + JLT repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short LEAL -65536(BP), BP MOVL BP, SI MOVW $0x001d, (AX) @@ -4403,30 +6718,30 @@ repeat_five_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_four_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: +repeat_four_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: LEAL -256(BP), BP MOVW $0x0019, (AX) MOVW BP, 2(AX) ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_three_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: +repeat_three_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: LEAL -4(BP), BP MOVW $0x0015, (AX) MOVB BP, 2(AX) ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP 
repeat_end_emit_encodeBetterBlockAsm4MB -repeat_two_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: +repeat_two_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: SHLL $0x02, BP ORL $0x01, BP MOVW BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: +repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm4MB_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BP*4), BP MOVB SI, 1(AX) @@ -4435,14 +6750,14 @@ repeat_two_offset_repeat_as_copy_encodeBetterBlockAsm_emit_copy_short: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm - JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB + JMP two_byte_offset_repeat_as_copy_encodeBetterBlockAsm4MB -two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: +two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm4MB: CMPL BP, $0x0c - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm4MB CMPL SI, $0x00000800 - JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm + JGE emit_copy_three_repeat_as_copy_encodeBetterBlockAsm4MB MOVB $0x01, BL LEAL -16(BX)(BP*4), BP MOVB SI, 1(AX) @@ -4451,28 +6766,28 @@ two_byte_offset_short_repeat_as_copy_encodeBetterBlockAsm: ORL SI, BP MOVB BP, (AX) ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm + JMP repeat_end_emit_encodeBetterBlockAsm4MB -emit_copy_three_repeat_as_copy_encodeBetterBlockAsm: +emit_copy_three_repeat_as_copy_encodeBetterBlockAsm4MB: MOVB $0x02, BL LEAL -4(BX)(BP*4), BP MOVB BP, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX -repeat_end_emit_encodeBetterBlockAsm: +repeat_end_emit_encodeBetterBlockAsm4MB: MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm + JMP search_loop_encodeBetterBlockAsm4MB -no_repeat_found_encodeBetterBlockAsm: +no_repeat_found_encodeBetterBlockAsm4MB: CMPL (DX)(BP*1), SI - JEQ 
candidate_match_encodeBetterBlockAsm + JEQ candidate_match_encodeBetterBlockAsm4MB CMPL (DX)(DI*1), SI - JEQ candidateS_match_encodeBetterBlockAsm + JEQ candidateS_match_encodeBetterBlockAsm4MB MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm + JMP search_loop_encodeBetterBlockAsm4MB -candidateS_match_encodeBetterBlockAsm: +candidateS_match_encodeBetterBlockAsm4MB: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x08, R9 @@ -4482,37 +6797,37 @@ candidateS_match_encodeBetterBlockAsm: INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BP*1), SI - JEQ candidate_match_encodeBetterBlockAsm + JEQ candidate_match_encodeBetterBlockAsm4MB DECL CX MOVL DI, BP -candidate_match_encodeBetterBlockAsm: +candidate_match_encodeBetterBlockAsm4MB: MOVL 12(SP), SI TESTL BP, BP - JZ match_extend_back_end_encodeBetterBlockAsm + JZ match_extend_back_end_encodeBetterBlockAsm4MB -match_extend_back_loop_encodeBetterBlockAsm: +match_extend_back_loop_encodeBetterBlockAsm4MB: CMPL CX, SI - JLE match_extend_back_end_encodeBetterBlockAsm + JLE match_extend_back_end_encodeBetterBlockAsm4MB MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI CMPB BL, DI - JNE match_extend_back_end_encodeBetterBlockAsm + JNE match_extend_back_end_encodeBetterBlockAsm4MB LEAL -1(CX), CX DECL BP - JZ match_extend_back_end_encodeBetterBlockAsm - JMP match_extend_back_loop_encodeBetterBlockAsm + JZ match_extend_back_end_encodeBetterBlockAsm4MB + JMP match_extend_back_loop_encodeBetterBlockAsm4MB -match_extend_back_end_encodeBetterBlockAsm: +match_extend_back_end_encodeBetterBlockAsm4MB: MOVL CX, SI SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI + LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) - JL match_dst_size_check_encodeBetterBlockAsm + JL match_dst_size_check_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -match_dst_size_check_encodeBetterBlockAsm: +match_dst_size_check_encodeBetterBlockAsm4MB: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BP @@ -4524,149 +6839,141 @@ match_dst_size_check_encodeBetterBlockAsm: // matchLen XORL R11, R11 CMPL DI, $0x08 - JL 
matchlen_single_match_nolit_encodeBetterBlockAsm + JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB -matchlen_loopback_match_nolit_encodeBetterBlockAsm: +matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm + JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB BSFQ R10, R10 SARQ $0x03, R10 LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm + JMP match_nolit_end_encodeBetterBlockAsm4MB -matchlen_loop_match_nolit_encodeBetterBlockAsm: +matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 - JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm + JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB -matchlen_single_match_nolit_encodeBetterBlockAsm: +matchlen_single_match_nolit_encodeBetterBlockAsm4MB: TESTL DI, DI - JZ match_nolit_end_encodeBetterBlockAsm + JZ match_nolit_end_encodeBetterBlockAsm4MB -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: +matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeBetterBlockAsm + JNE match_nolit_end_encodeBetterBlockAsm4MB LEAL 1(R11), R11 DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm + JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB -match_nolit_end_encodeBetterBlockAsm: +match_nolit_end_encodeBetterBlockAsm4MB: MOVL CX, DI SUBL BP, DI CMPL R11, $0x01 - JG match_length_ok_encodeBetterBlockAsm + JG match_length_ok_encodeBetterBlockAsm4MB CMPL DI, $0x0000ffff - JLE match_length_ok_encodeBetterBlockAsm + JLE match_length_ok_encodeBetterBlockAsm4MB MOVL 20(SP), CX INCL CX - JMP search_loop_encodeBetterBlockAsm + JMP search_loop_encodeBetterBlockAsm4MB -match_length_ok_encodeBetterBlockAsm: +match_length_ok_encodeBetterBlockAsm4MB: MOVL DI, 16(SP) MOVL 12(SP), BP CMPL BP, SI - JEQ 
emit_literal_done_match_emit_encodeBetterBlockAsm + JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BP*1), R8 SUBL BP, DI LEAL -1(DI), BP CMPL BP, $0x3c - JLT one_byte_match_emit_encodeBetterBlockAsm + JLT one_byte_match_emit_encodeBetterBlockAsm4MB CMPL BP, $0x00000100 - JLT two_bytes_match_emit_encodeBetterBlockAsm + JLT two_bytes_match_emit_encodeBetterBlockAsm4MB CMPL BP, $0x00010000 - JLT three_bytes_match_emit_encodeBetterBlockAsm - CMPL BP, $0x01000000 - JLT four_bytes_match_emit_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL BP, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -four_bytes_match_emit_encodeBetterBlockAsm: + JLT three_bytes_match_emit_encodeBetterBlockAsm4MB MOVL BP, R9 SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB -three_bytes_match_emit_encodeBetterBlockAsm: +three_bytes_match_emit_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB -two_bytes_match_emit_encodeBetterBlockAsm: +two_bytes_match_emit_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) MOVB BP, 1(AX) ADDQ $0x02, AX CMPL BP, $0x40 - JL memmove_match_emit_encodeBetterBlockAsm - JMP memmove_long_match_emit_encodeBetterBlockAsm + JL memmove_match_emit_encodeBetterBlockAsm4MB + JMP memmove_long_match_emit_encodeBetterBlockAsm4MB -one_byte_match_emit_encodeBetterBlockAsm: +one_byte_match_emit_encodeBetterBlockAsm4MB: SHLB $0x02, BP MOVB BP, (AX) ADDQ $0x01, AX -memmove_match_emit_encodeBetterBlockAsm: +memmove_match_emit_encodeBetterBlockAsm4MB: LEAQ (AX)(DI*1), BP // genMemMoveShort CMPQ DI, $0x03 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3 + JB 
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ DI, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ DI, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 CMPQ DI, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2: +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2: MOVB (R8), R9 MOVB -1(R8)(DI*1), R8 MOVB R9, (AX) MOVB R8, -1(AX)(DI*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3: +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3: MOVW (R8), R9 MOVB 2(R8), R8 MOVW R9, (AX) MOVB R8, 2(AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: MOVL (R8), R9 MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, 
(AX) MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm + JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: +emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 @@ -4676,11 +6983,11 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) -memmove_end_copy_match_emit_encodeBetterBlockAsm: +memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: MOVQ BP, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm + JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB -memmove_long_match_emit_encodeBetterBlockAsm: +memmove_long_match_emit_encodeBetterBlockAsm4MB: LEAQ (AX)(DI*1), BP // genMemMoveLong @@ -4695,11 +7002,11 @@ memmove_long_match_emit_encodeBetterBlockAsm: MOVQ $0x00000040, R12 SUBQ R9, R12 DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R9 LEAQ -32(AX)(R12*1), R13 -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R13) @@ -4708,23 +7015,23 @@ emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: ADDQ $0x20, R9 ADDQ $0x20, R12 DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back + JNA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ DI, R12 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BP, AX -emit_literal_done_match_emit_encodeBetterBlockAsm: +emit_literal_done_match_emit_encodeBetterBlockAsm4MB: ADDL R11, CX MOVL 16(SP), BP ADDL $0x04, R11 @@ -4732,44 +7039,33 @@ emit_literal_done_match_emit_encodeBetterBlockAsm: // emitCopy CMPL BP, $0x00010000 - JL two_byte_offset_match_nolit_encodeBetterBlockAsm + JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB -four_bytes_loop_back_match_nolit_encodeBetterBlockAsm: +four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB: CMPL R11, $0x40 - JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm + JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB MOVB $0xff, (AX) MOVL BP, 1(AX) LEAL -64(R11), R11 ADDQ $0x05, AX CMPL R11, $0x04 - JL four_bytes_remain_match_nolit_encodeBetterBlockAsm + JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: MOVL R11, DI LEAL -4(R11), R11 CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy CMPL BP, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy + JLT 
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: CMPL R11, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy CMPL R11, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R11, $0x0100ffff - JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy - LEAL -16842747(R11), R11 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy LEAL -65536(R11), R11 MOVL R11, BP MOVW $0x001d, (AX) @@ -4777,30 +7073,30 @@ repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 
-repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: XORQ DI, DI LEAL 1(DI)(R11*4), R11 MOVB BP, 1(AX) @@ -4809,53 +7105,42 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: ORL BP, R11 MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB -four_bytes_remain_match_nolit_encodeBetterBlockAsm: +four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: TESTL R11, R11 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm + JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB MOVB $0x03, BL LEAL -4(BX)(R11*4), R11 MOVB R11, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -two_byte_offset_match_nolit_encodeBetterBlockAsm: +two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: CMPL R11, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB MOVB $0xee, (AX) MOVW BP, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: MOVL R11, DI LEAL -4(R11), R11 CMPL DI, $0x08 - JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short CMPL BP, $0x00000800 - JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: 
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: CMPL R11, $0x00000104 - JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short CMPL R11, $0x00010100 - JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R11, $0x0100ffff - JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(R11), R11 - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short LEAL -65536(R11), R11 MOVL R11, BP MOVW $0x001d, (AX) @@ -4863,30 +7148,30 @@ repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 
-repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: XORQ DI, DI LEAL 1(DI)(R11*4), R11 MOVB BP, 1(AX) @@ -4895,14 +7180,14 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: ORL BP, R11 MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB -two_byte_offset_short_match_nolit_encodeBetterBlockAsm: +two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: CMPL R11, $0x0c - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB CMPL BP, $0x00000800 - JGE emit_copy_three_match_nolit_encodeBetterBlockAsm + JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB MOVB $0x01, BL LEAL -16(BX)(R11*4), R11 MOVB BP, 1(AX) @@ -4911,24 +7196,24 @@ two_byte_offset_short_match_nolit_encodeBetterBlockAsm: ORL BP, R11 MOVB R11, (AX) ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB -emit_copy_three_match_nolit_encodeBetterBlockAsm: +emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: MOVB $0x02, BL LEAL -4(BX)(R11*4), R11 MOVB R11, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX -match_nolit_emitcopy_end_encodeBetterBlockAsm: +match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: CMPL CX, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm + JGE emit_remainder_encodeBetterBlockAsm4MB CMPQ AX, (SP) - JL match_nolit_dst_ok_encodeBetterBlockAsm + JL match_nolit_dst_ok_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -match_nolit_dst_ok_encodeBetterBlockAsm: +match_nolit_dst_ok_encodeBetterBlockAsm4MB: MOVQ $0x00cf1bbcdcbfa563, BP MOVQ $0x9e3779b1, DI INCL SI @@ -4959,120 +7244,112 @@ match_nolit_dst_ok_encodeBetterBlockAsm: SHRQ $0x32, R10 MOVL R8, 24(SP)(R9*4) MOVL SI, 
262168(SP)(R10*4) - JMP search_loop_encodeBetterBlockAsm + JMP search_loop_encodeBetterBlockAsm4MB -emit_remainder_encodeBetterBlockAsm: +emit_remainder_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX + LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) - JL emit_remainder_ok_encodeBetterBlockAsm + JL emit_remainder_ok_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET -emit_remainder_ok_encodeBetterBlockAsm: +emit_remainder_ok_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm + JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB MOVL CX, BP MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, BP LEAL -1(BP), DX CMPL DX, $0x3c - JLT one_byte_emit_remainder_encodeBetterBlockAsm + JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB CMPL DX, $0x00000100 - JLT two_bytes_emit_remainder_encodeBetterBlockAsm + JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB CMPL DX, $0x00010000 - JLT three_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x01000000 - JLT four_bytes_emit_remainder_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -four_bytes_emit_remainder_encodeBetterBlockAsm: + JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB -three_bytes_emit_remainder_encodeBetterBlockAsm: +three_bytes_emit_remainder_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB -two_bytes_emit_remainder_encodeBetterBlockAsm: +two_bytes_emit_remainder_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 - JL 
memmove_emit_remainder_encodeBetterBlockAsm - JMP memmove_long_emit_remainder_encodeBetterBlockAsm + JL memmove_emit_remainder_encodeBetterBlockAsm4MB + JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB -one_byte_emit_remainder_encodeBetterBlockAsm: +one_byte_emit_remainder_encodeBetterBlockAsm4MB: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX -memmove_emit_remainder_encodeBetterBlockAsm: +memmove_emit_remainder_encodeBetterBlockAsm4MB: LEAQ (AX)(BP*1), DX MOVL BP, BX // genMemMoveShort CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 + JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: MOVB (CX), BP MOVB -1(CX)(BX*1), CL MOVB BP, (AX) MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: 
+emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: MOVW (CX), BP MOVB 2(CX), CL MOVW BP, (AX) MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: MOVL (CX), BP MOVL -4(CX)(BX*1), CX MOVL BP, (AX) MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: MOVQ (CX), BP MOVQ -8(CX)(BX*1), CX MOVQ BP, (AX) MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 @@ -5082,11 +7359,11 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) -memmove_end_copy_emit_remainder_encodeBetterBlockAsm: +memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm + JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB -memmove_long_emit_remainder_encodeBetterBlockAsm: 
+memmove_long_emit_remainder_encodeBetterBlockAsm4MB: LEAQ (AX)(BP*1), DX MOVL BP, BX @@ -5102,11 +7379,11 @@ memmove_long_emit_remainder_encodeBetterBlockAsm: MOVQ $0x00000040, DI SUBQ BP, DI DECQ SI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(CX)(DI*1), BP LEAQ -32(AX)(DI*1), R8 -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: MOVOU (BP), X4 MOVOU 16(BP), X5 MOVOA X4, (R8) @@ -5115,23 +7392,23 @@ emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: ADDQ $0x20, BP ADDQ $0x20, DI DECQ SI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back + JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: +emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(CX)(DI*1), X4 MOVOU -16(CX)(DI*1), X5 MOVOA X4, -32(AX)(DI*1) MOVOA X5, -16(AX)(DI*1) ADDQ $0x20, DI CMPQ BX, DI - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX -emit_literal_done_emit_remainder_encodeBetterBlockAsm: +emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) From b5b73be9b80deece74d4d22c8ea2d28d2b111342 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 25 Feb 2021 10:52:31 +0100 Subject: [PATCH 10/10] Update README.md --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d9f9249a4e..637ed0ca41 100644 --- 
a/README.md +++ b/README.md @@ -14,7 +14,13 @@ This package provides various compression algorithms. [![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge) # changelog - +* Feb 25, 2021 (v1.11.8) + * s2: Fixed occasional out-of-bounds write on amd64. Upgrade recommended. + * s2: Add AMD64 assembly for better mode. 25-50% faster. [#315](https://github.com/klauspost/compress/pull/315) + * s2: Less upfront decoder allocation. [#322](https://github.com/klauspost/compress/pull/322) + * zstd: Faster "compression" of incompressible data. [#314](https://github.com/klauspost/compress/pull/314) + * zip: Fix zip64 headers. [#313](https://github.com/klauspost/compress/pull/313) + * Jan 14, 2021 (v1.11.7) * Use Bytes() interface to get bytes across packages. [#309](https://github.com/klauspost/compress/pull/309) * s2: Add 'best' compression option. [#310](https://github.com/klauspost/compress/pull/310)