From e7a6b52374f7e2abfb8abb27249d53a1997b09a7 Mon Sep 17 00:00:00 2001 From: Caleb Spare Date: Mon, 23 Aug 2021 23:25:48 -0700 Subject: [PATCH] Fix asm for dynamic linking R15 and CX are both clobbered under dynamic linking; avoid them. Fixes #54 --- xxhash_amd64.s | 62 +++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/xxhash_amd64.s b/xxhash_amd64.s index d580e32..be8db5b 100644 --- a/xxhash_amd64.s +++ b/xxhash_amd64.s @@ -6,7 +6,7 @@ // Register allocation: // AX h -// CX pointer to advance through b +// SI pointer to advance through b // DX n // BX loop end // R8 v1, k1 @@ -16,39 +16,39 @@ // R12 tmp // R13 prime1v // R14 prime2v -// R15 prime4v +// DI prime4v -// round reads from and advances the buffer pointer in CX. +// round reads from and advances the buffer pointer in SI. // It assumes that R13 has prime1v and R14 has prime2v. #define round(r) \ - MOVQ (CX), R12 \ - ADDQ $8, CX \ + MOVQ (SI), R12 \ + ADDQ $8, SI \ IMULQ R14, R12 \ ADDQ R12, r \ ROLQ $31, r \ IMULQ R13, r // mergeRound applies a merge round on the two registers acc and val. -// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v. +// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v. #define mergeRound(acc, val) \ IMULQ R14, val \ ROLQ $31, val \ IMULQ R13, val \ XORQ val, acc \ IMULQ R13, acc \ - ADDQ R15, acc + ADDQ DI, acc // func Sum64(b []byte) uint64 TEXT ·Sum64(SB), NOSPLIT, $0-32 // Load fixed primes. MOVQ ·prime1v(SB), R13 MOVQ ·prime2v(SB), R14 - MOVQ ·prime4v(SB), R15 + MOVQ ·prime4v(SB), DI // Load slice. - MOVQ b_base+0(FP), CX + MOVQ b_base+0(FP), SI MOVQ b_len+8(FP), DX - LEAQ (CX)(DX*1), BX + LEAQ (SI)(DX*1), BX // The first loop limit will be len(b)-32. SUBQ $32, BX @@ -65,14 +65,14 @@ TEXT ·Sum64(SB), NOSPLIT, $0-32 XORQ R11, R11 SUBQ R13, R11 - // Loop until CX > BX. + // Loop until SI > BX. blockLoop: round(R8) round(R9) round(R10) round(R11) - CMPQ CX, BX + CMPQ SI, BX JLE blockLoop MOVQ R8, AX @@ -100,16 +100,16 @@ noBlocks: afterBlocks: ADDQ DX, AX - // Right now BX has len(b)-32, and we want to loop until CX > len(b)-8. + // Right now BX has len(b)-32, and we want to loop until SI > len(b)-8. ADDQ $24, BX - CMPQ CX, BX + CMPQ SI, BX JG fourByte wordLoop: // Calculate k1. - MOVQ (CX), R8 - ADDQ $8, CX + MOVQ (SI), R8 + ADDQ $8, SI IMULQ R14, R8 ROLQ $31, R8 IMULQ R13, R8 @@ -117,18 +117,18 @@ wordLoop: XORQ R8, AX ROLQ $27, AX IMULQ R13, AX - ADDQ R15, AX + ADDQ DI, AX - CMPQ CX, BX + CMPQ SI, BX JLE wordLoop fourByte: ADDQ $4, BX - CMPQ CX, BX + CMPQ SI, BX JG singles - MOVL (CX), R8 - ADDQ $4, CX + MOVL (SI), R8 + ADDQ $4, SI IMULQ R13, R8 XORQ R8, AX @@ -138,19 +138,19 @@ fourByte: singles: ADDQ $4, BX - CMPQ CX, BX + CMPQ SI, BX JGE finalize singlesLoop: - MOVBQZX (CX), R12 - ADDQ $1, CX + MOVBQZX (SI), R12 + ADDQ $1, SI IMULQ ·prime5v(SB), R12 XORQ R12, AX ROLQ $11, AX IMULQ R13, AX - CMPQ CX, BX + CMPQ SI, BX JL singlesLoop finalize: @@ -179,9 +179,9 @@ TEXT ·writeBlocks(SB), NOSPLIT, $0-40 MOVQ ·prime2v(SB), R14 // Load slice. - MOVQ b_base+8(FP), CX + MOVQ b_base+8(FP), SI MOVQ b_len+16(FP), DX - LEAQ (CX)(DX*1), BX + LEAQ (SI)(DX*1), BX SUBQ $32, BX // Load vN from d. @@ -199,7 +199,7 @@ blockLoop: round(R10) round(R11) - CMPQ CX, BX + CMPQ SI, BX JLE blockLoop // Copy vN back to d. @@ -208,8 +208,8 @@ blockLoop: MOVQ R10, 16(AX) MOVQ R11, 24(AX) - // The number of bytes written is CX minus the old base pointer. - SUBQ b_base+8(FP), CX - MOVQ CX, ret+32(FP) + // The number of bytes written is SI minus the old base pointer. + SUBQ b_base+8(FP), SI + MOVQ SI, ret+32(FP) RET