diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index 6b480af39b..6c06969ea9 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -1501,6 +1501,7 @@ func (o options) genEmitLiteral() { Pragma("noescape") dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64() + restore := saveBP() Load(Param("lit").Len(), litLen) Load(Param("dst").Base(), dstBase) Load(Param("lit").Base(), litBase) @@ -1513,6 +1514,7 @@ func (o options) genEmitLiteral() { Label("emit_literal_end_standalone") Store(retval, ReturnIndex(0)) + restore() RET() } @@ -1669,6 +1671,7 @@ func (o options) genEmitRepeat() { Pragma("noescape") dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() + restore := saveBP() // retval = 0 XORQ(retval, retval) @@ -1679,6 +1682,7 @@ func (o options) genEmitRepeat() { o.emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end")) Label("gen_emit_repeat_end") Store(retval, ReturnIndex(0)) + restore() RET() } @@ -1824,16 +1828,17 @@ func (o options) genEmitCopy() { Pragma("noescape") dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() + restore := saveBP() // i := 0 XORQ(retval, retval) - Load(Param("dst").Base(), dstBase) Load(Param("offset"), offset) Load(Param("length"), length) o.emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end")) Label("gen_emit_copy_end") Store(retval, ReturnIndex(0)) + restore() RET() } @@ -1855,6 +1860,7 @@ func (o options) genEmitCopyNoRepeat() { Pragma("noescape") dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() + restore := saveBP() // i := 0 XORQ(retval, retval) @@ -1865,6 +1871,7 @@ func (o options) genEmitCopyNoRepeat() { o.emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy") Label("gen_emit_copy_end_snappy") Store(retval, ReturnIndex(0)) + restore() RET() } @@ -2368,12 +2375,15 @@ func (o options) genMatchLen() { aBase, bBase, length := GP64(), GP64(), GP64() + restore := saveBP() + Load(Param("a").Base(), aBase) Load(Param("b").Base(), bBase) Load(Param("a").Len(), length) l := o.matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end")) Label("gen_match_len_end") Store(l.As64(), ReturnIndex(0)) + restore() RET() } @@ -2519,3 +2529,12 @@ func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) JMP(end) return matched } + +// saveBP will save RBP in an XMM register and restore it when returning. +func saveBP() (restore func()) { + x := XMM() + MOVQ(reg.RBP, x) + return func() { + MOVQ(x, reg.RBP) + } +} diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 748c1c2e20..f065b48da1 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -12603,221 +12603,224 @@ gen_emit_repeat_end: RET // func emitCopy(dst []byte, offset int, length int) int +// Requires: SSE2 TEXT ·emitCopy(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX + XORQ AX, AX + MOVQ BP, X0 + MOVQ dst_base+0(FP), CX + MOVQ offset+24(FP), DX + MOVQ length+32(FP), BX // emitCopy - CMPL CX, $0x00010000 + CMPL DX, $0x00010000 JL two_byte_offset_standalone four_bytes_loop_back_standalone: - CMPL DX, $0x40 + CMPL BX, $0x40 JLE four_bytes_remain_standalone - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX + MOVB $0xff, (CX) + MOVL DX, 1(CX) + LEAL -64(BX), BX ADDQ $0x05, AX - CMPL DX, $0x04 + ADDQ $0x05, CX + CMPL BX, $0x04 JL four_bytes_remain_standalone // emitRepeat emit_repeat_again_standalone_emit_copy: - MOVL DX, BP - LEAL -4(DX), DX + MOVL BX, BP + LEAL -4(BX), BX CMPL BP, $0x08 JLE repeat_two_standalone_emit_copy CMPL BP, $0x0c JGE cant_repeat_two_offset_standalone_emit_copy - CMPL CX, $0x00000800 + CMPL DX, $0x00000800 JLT repeat_two_offset_standalone_emit_copy cant_repeat_two_offset_standalone_emit_copy: - CMPL DX, $0x00000104 + CMPL BX, $0x00000104 JLT repeat_three_standalone_emit_copy - CMPL DX, $0x00010100 + CMPL BX, $0x00010100 JLT repeat_four_standalone_emit_copy - CMPL DX, $0x0100ffff + CMPL BX, $0x0100ffff JLT repeat_five_standalone_emit_copy - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) + LEAL -16842747(BX), BX + MOVW $0x001d, (CX) + MOVW $0xfffb, 2(CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX ADDQ $0x05, AX - ADDQ $0x05, BX JMP emit_repeat_again_standalone_emit_copy repeat_five_standalone_emit_copy: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX + LEAL -65536(BX), BX + MOVL BX, DX + MOVW $0x001d, (CX) + MOVW BX, 2(CX) + SARL $0x10, DX + MOVB DL, 4(CX) ADDQ $0x05, AX + ADDQ $0x05, CX JMP gen_emit_copy_end repeat_four_standalone_emit_copy: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX + LEAL -256(BX), BX + MOVW $0x0019, (CX) + MOVW BX, 2(CX) ADDQ $0x04, AX + ADDQ $0x04, CX JMP gen_emit_copy_end repeat_three_standalone_emit_copy: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX + LEAL -4(BX), BX + MOVW $0x0015, (CX) + MOVB BL, 2(CX) ADDQ $0x03, AX + ADDQ $0x03, CX JMP gen_emit_copy_end repeat_two_standalone_emit_copy: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (CX) ADDQ $0x02, AX + ADDQ $0x02, CX JMP gen_emit_copy_end repeat_two_offset_standalone_emit_copy: XORQ BP, BP - LEAL 1(BP)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX + LEAL 1(BP)(BX*4), BX + MOVB DL, 1(CX) + SARL $0x08, DX + SHLL $0x05, DX + ORL DX, BX + MOVB BL, (CX) + ADDQ $0x02, AX + ADDQ $0x02, CX JMP gen_emit_copy_end JMP four_bytes_loop_back_standalone four_bytes_remain_standalone: - TESTL DX, DX + TESTL BX, BX JZ gen_emit_copy_end MOVB $0x03, BP - LEAL -4(BP)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX + LEAL -4(BP)(BX*4), BX + MOVB BL, (CX) + MOVL DX, 1(CX) ADDQ $0x05, AX + ADDQ $0x05, CX JMP gen_emit_copy_end two_byte_offset_standalone: - CMPL DX, $0x40 + CMPL BX, $0x40 JLE two_byte_offset_short_standalone - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX + MOVB $0xee, (CX) + MOVW DX, 1(CX) + LEAL -60(BX), BX + ADDQ $0x03, CX ADDQ $0x03, AX - ADDQ $0x03, BX // emitRepeat emit_repeat_again_standalone_emit_copy_short: - MOVL DX, BP - LEAL -4(DX), DX + MOVL BX, BP + LEAL -4(BX), BX CMPL BP, $0x08 JLE repeat_two_standalone_emit_copy_short CMPL BP, $0x0c JGE cant_repeat_two_offset_standalone_emit_copy_short - CMPL CX, $0x00000800 + CMPL DX, $0x00000800 JLT repeat_two_offset_standalone_emit_copy_short cant_repeat_two_offset_standalone_emit_copy_short: - CMPL DX, $0x00000104 + CMPL BX, $0x00000104 JLT repeat_three_standalone_emit_copy_short - CMPL DX, $0x00010100 + CMPL BX, $0x00010100 JLT repeat_four_standalone_emit_copy_short - CMPL DX, $0x0100ffff + CMPL BX, $0x0100ffff JLT repeat_five_standalone_emit_copy_short - LEAL -16842747(DX), DX - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) + LEAL -16842747(BX), BX + MOVW $0x001d, (CX) + MOVW $0xfffb, 2(CX) + MOVB $0xff, 4(CX) + ADDQ $0x05, CX ADDQ $0x05, AX - ADDQ $0x05, BX JMP emit_repeat_again_standalone_emit_copy_short repeat_five_standalone_emit_copy_short: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX + LEAL -65536(BX), BX + MOVL BX, DX + MOVW $0x001d, (CX) + MOVW BX, 2(CX) + SARL $0x10, DX + MOVB DL, 4(CX) ADDQ $0x05, AX + ADDQ $0x05, CX JMP gen_emit_copy_end repeat_four_standalone_emit_copy_short: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX + LEAL -256(BX), BX + MOVW $0x0019, (CX) + MOVW BX, 2(CX) ADDQ $0x04, AX + ADDQ $0x04, CX JMP gen_emit_copy_end repeat_three_standalone_emit_copy_short: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX + LEAL -4(BX), BX + MOVW $0x0015, (CX) + MOVB BL, 2(CX) ADDQ $0x03, AX + ADDQ $0x03, CX JMP gen_emit_copy_end repeat_two_standalone_emit_copy_short: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX + SHLL $0x02, BX + ORL $0x01, BX + MOVW BX, (CX) ADDQ $0x02, AX + ADDQ $0x02, CX JMP gen_emit_copy_end repeat_two_offset_standalone_emit_copy_short: XORQ BP, BP - LEAL 1(BP)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX + LEAL 1(BP)(BX*4), BX + MOVB DL, 1(CX) + SARL $0x08, DX + SHLL $0x05, DX + ORL DX, BX + MOVB BL, (CX) + ADDQ $0x02, AX + ADDQ $0x02, CX JMP gen_emit_copy_end JMP two_byte_offset_standalone two_byte_offset_short_standalone: - CMPL DX, $0x0c + CMPL BX, $0x0c JGE emit_copy_three_standalone - CMPL CX, $0x00000800 + CMPL DX, $0x00000800 JGE emit_copy_three_standalone MOVB $0x01, BP - LEAL -16(BP)(DX*4), DX - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX + LEAL -16(BP)(BX*4), BX + MOVB DL, 1(CX) + SHRL $0x08, DX + SHLL $0x05, DX + ORL DX, BX + MOVB BL, (CX) + ADDQ $0x02, AX + ADDQ $0x02, CX JMP gen_emit_copy_end emit_copy_three_standalone: MOVB $0x02, BP - LEAL -4(BP)(DX*4), DX - MOVB DL, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX + LEAL -4(BP)(BX*4), BX + MOVB BL, (CX) + MOVW DX, 1(CX) ADDQ $0x03, AX + ADDQ $0x03, CX gen_emit_copy_end: - MOVQ BX, ret+40(FP) + MOVQ AX, ret+40(FP) + MOVQ X0, BP RET // func emitCopyNoRepeat(dst []byte, offset int, length int) int diff --git a/zstd/enc_fast.go b/zstd/enc_fast.go index ba4a17e106..f107519e57 100644 --- a/zstd/enc_fast.go +++ b/zstd/enc_fast.go @@ -34,6 +34,7 @@ type fastEncoderDict struct { dictTable []tableEntry tableShardDirty [tableShardCnt]bool allDirty bool + dictContent []byte } // Encode mimmics functionality in zstd_fast.c @@ -625,6 +626,27 @@ encodeLoop: } } +func (e *fastEncoderDict) load3232(b []byte, i int32) uint32 { + if i >= 0 { + return load3232(b, i) + } + return 0 +} + +func (e *fastEncoderDict) load6432(b []byte, i int32) uint64 { + if i >= 0 { + return load6432(b, i) + } + return 0 +} + +func (e *fastEncoderDict) load64(b []byte, i int) uint64 { + if i >= 0 { + return load64(b, i) + } + return 0 +} + // Encode will encode the content, with a dictionary if initialized for it. func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) { const ( @@ -726,7 +748,7 @@ encodeLoop: e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)} e.markShardDirty(nextHash2) - if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) { + if canRepeat && repIndex >= 0 && e.load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq var length int32 @@ -737,7 +759,7 @@ encodeLoop: endI := len(a) & (math.MaxInt32 - 7) length = int32(endI) + 4 for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { + if diff := e.load64(a, i) ^ e.load64(b, i); diff != 0 { length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 break } @@ -779,7 +801,7 @@ encodeLoop: } break encodeLoop } - cv = load6432(src, s) + cv = e.load6432(src, s) continue } coffset0 := s - (candidate.offset - e.cur) @@ -815,7 +837,7 @@ encodeLoop: if s >= sLimit { break encodeLoop } - cv = load6432(src, s) + cv = e.load6432(src, s) } // A 4-byte match has been found. We'll later see if more than 4 bytes. offset2 = offset1 @@ -838,7 +860,7 @@ encodeLoop: endI := len(a) & (math.MaxInt32 - 7) l = int32(endI) + 4 for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { + if diff := e.load64(a, i) ^ e.load64(b, i); diff != 0 { l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 break } @@ -874,10 +896,10 @@ encodeLoop: if s >= sLimit { break encodeLoop } - cv = load6432(src, s) + cv = e.load6432(src, s) // Check offset 2 - if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { + if o2 := s - offset2; canRepeat && e.load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match //l := 4 + e.matchlen(s+4, o2+4, src) @@ -888,7 +910,7 @@ encodeLoop: endI := len(a) & (math.MaxInt32 - 7) l = int32(endI) + 4 for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { + if diff := e.load64(a, i) ^ e.load64(b, i); diff != 0 { l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 break } @@ -916,7 +938,7 @@ encodeLoop: break encodeLoop } // Prepare next loop. - cv = load6432(src, s) + cv = e.load6432(src, s) } } @@ -941,12 +963,14 @@ func (e *fastEncoder) Reset(d *dict, singleBlock bool) { // ResetDict will reset and set a dictionary if not nil func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) { + e.resetBase(d, singleBlock) if d == nil { return } // Init or copy dict table + e.dictContent = d.content if len(e.dictTable) != len(e.table) || d.id != e.lastDictID { if len(e.dictTable) != len(e.table) { e.dictTable = make([]tableEntry, len(e.table))