diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index c939174b05..5c9b5dc57e 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -237,7 +237,6 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m }) cv := GP64() - MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) nextS := GP32() // nextS := s + (s-nextEmit)>>6 + 4 { @@ -252,6 +251,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m CMPL(nextS.As32(), sLimitL) JGE(LabelRef("emit_remainder_" + name)) } + MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) assert(func(ok LabelRef) { // Check if s is valid (we should have jumped above if not) tmp := GP64() @@ -888,7 +888,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash }) cv := GP64() - MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) nextS := GP32() // nextS := s + (s-nextEmit)>>skipLog + 1 { @@ -903,6 +902,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash CMPL(nextS.As32(), sLimitL) JGE(LabelRef("emit_remainder_" + name)) } + MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) assert(func(ok LabelRef) { // Check if s is valid (we should have jumped above if not) tmp := GP64() diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 916a9736fc..7d1ed1bd9c 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -40,13 +40,13 @@ zero_loop_encodeBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x06, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBlockAsm + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 @@ -1252,13 +1252,13 @@ zero_loop_encodeBlockAsm4MB: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm4MB: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x06, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBlockAsm4MB + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 @@ -2385,13 +2385,13 @@ zero_loop_encodeBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm12B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBlockAsm12B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 @@ -3289,13 +3289,13 @@ zero_loop_encodeBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm10B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBlockAsm10B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 @@ -4193,13 +4193,13 @@ zero_loop_encodeBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm8B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x04, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBlockAsm8B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 @@ -5081,13 +5081,13 @@ zero_loop_encodeBetterBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x07, BP LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R8 MOVQ $0x9e3779b1, BP @@ -6059,13 +6059,13 @@ zero_loop_encodeBetterBlockAsm4MB: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm4MB: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x07, BP LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBetterBlockAsm4MB + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R8 MOVQ $0x9e3779b1, BP @@ -6980,13 +6980,13 @@ zero_loop_encodeBetterBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm12B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x06, BP LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBetterBlockAsm12B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BP @@ -7755,13 +7755,13 @@ zero_loop_encodeBetterBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm10B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBetterBlockAsm10B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BP @@ -8530,13 +8530,13 @@ zero_loop_encodeBetterBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm8B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x04, BP LEAL 1(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeBetterBlockAsm8B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BP @@ -9295,13 +9295,13 @@ zero_loop_encodeSnappyBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x06, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 @@ -10169,13 +10169,13 @@ zero_loop_encodeSnappyBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm12B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm12B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 @@ -10938,13 +10938,13 @@ zero_loop_encodeSnappyBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm10B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x05, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm10B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 @@ -11707,13 +11707,13 @@ zero_loop_encodeSnappyBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm8B: - MOVQ (DX)(CX*1), SI MOVL CX, BP SUBL 12(SP), BP SHRL $0x04, BP LEAL 4(CX)(BP*1), BP CMPL BP, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm8B + MOVQ (DX)(CX*1), SI MOVL BP, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9