Skip to content

Commit

Permalink
Fix asm for dynamic linking
Browse files Browse the repository at this point in the history
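R15 and CX are both clobbered under dynamic linking; avoid them by using DI (in place of R15, for prime4v) and SI (in place of CX, for the pointer that advances through b).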

Fixes #54
  • Loading branch information
cespare committed Aug 24, 2021
1 parent 3b9a658 commit e7a6b52
Showing 1 changed file with 31 additions and 31 deletions.
62 changes: 31 additions & 31 deletions xxhash_amd64.s
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@

// Register allocation:
// AX h
// CX pointer to advance through b
// SI pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
@@ -16,39 +16,39 @@
// R12 tmp
// R13 prime1v
// R14 prime2v
// R15 prime4v
// DI prime4v

// round reads from and advances the buffer pointer in CX.
// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has prime1v and R14 has prime2v.
#define round(r) \
MOVQ (CX), R12 \
ADDQ $8, CX \
MOVQ (SI), R12 \
ADDQ $8, SI \
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
#define mergeRound(acc, val) \
IMULQ R14, val \
ROLQ $31, val \
IMULQ R13, val \
XORQ val, acc \
IMULQ R13, acc \
ADDQ R15, acc
ADDQ DI, acc

// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
// Load fixed primes.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·prime4v(SB), R15
MOVQ ·prime4v(SB), DI

// Load slice.
MOVQ b_base+0(FP), CX
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), DX
LEAQ (CX)(DX*1), BX
LEAQ (SI)(DX*1), BX

// The first loop limit will be len(b)-32.
SUBQ $32, BX
@@ -65,14 +65,14 @@ TEXT ·Sum64(SB), NOSPLIT, $0-32
XORQ R11, R11
SUBQ R13, R11

// Loop until CX > BX.
// Loop until SI > BX.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)

CMPQ CX, BX
CMPQ SI, BX
JLE blockLoop

MOVQ R8, AX
@@ -100,35 +100,35 @@ noBlocks:
afterBlocks:
ADDQ DX, AX

// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
ADDQ $24, BX

CMPQ CX, BX
CMPQ SI, BX
JG fourByte

wordLoop:
// Calculate k1.
MOVQ (CX), R8
ADDQ $8, CX
MOVQ (SI), R8
ADDQ $8, SI
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8

XORQ R8, AX
ROLQ $27, AX
IMULQ R13, AX
ADDQ R15, AX
ADDQ DI, AX

CMPQ CX, BX
CMPQ SI, BX
JLE wordLoop

fourByte:
ADDQ $4, BX
CMPQ CX, BX
CMPQ SI, BX
JG singles

MOVL (CX), R8
ADDQ $4, CX
MOVL (SI), R8
ADDQ $4, SI
IMULQ R13, R8
XORQ R8, AX

@@ -138,19 +138,19 @@ fourByte:

singles:
ADDQ $4, BX
CMPQ CX, BX
CMPQ SI, BX
JGE finalize

singlesLoop:
MOVBQZX (CX), R12
ADDQ $1, CX
MOVBQZX (SI), R12
ADDQ $1, SI
IMULQ ·prime5v(SB), R12
XORQ R12, AX

ROLQ $11, AX
IMULQ R13, AX

CMPQ CX, BX
CMPQ SI, BX
JL singlesLoop

finalize:
@@ -179,9 +179,9 @@ TEXT ·writeBlocks(SB), NOSPLIT, $0-40
MOVQ ·prime2v(SB), R14

// Load slice.
MOVQ b_base+8(FP), CX
MOVQ b_base+8(FP), SI
MOVQ b_len+16(FP), DX
LEAQ (CX)(DX*1), BX
LEAQ (SI)(DX*1), BX
SUBQ $32, BX

// Load vN from d.
@@ -199,7 +199,7 @@ blockLoop:
round(R10)
round(R11)

CMPQ CX, BX
CMPQ SI, BX
JLE blockLoop

// Copy vN back to d.
@@ -208,8 +208,8 @@ blockLoop:
MOVQ R10, 16(AX)
MOVQ R11, 24(AX)

// The number of bytes written is CX minus the old base pointer.
SUBQ b_base+8(FP), CX
MOVQ CX, ret+32(FP)
// The number of bytes written is SI minus the old base pointer.
SUBQ b_base+8(FP), SI
MOVQ SI, ret+32(FP)

RET

0 comments on commit e7a6b52

Please sign in to comment.