src/engine/x86-64/X86_64MacroAssembler.v3

// Copyright 2022 Ben L. Titzer. All rights reserved.
// See LICENSE for details of Apache 2.0 license.

// Shorthands that convert an abstract {Reg} into an x86-64 general-purpose ({G}) or XMM ({X}) register.
def G = X86_64MasmRegs.toGpr, X = X86_64MasmRegs.toXmmr;
// Shorthand for {X86_64Regs}.
def R: X86_64Regs;
// Shorthand for {X86_64Conds}.
def C: X86_64Conds;
// Converts a macro-assembler address (base register + offset) into an x86-64 address
// with no index register and a scale of 1.
def A(ma: MasmAddr) -> X86_64Addr {
	return X86_64Addr.new(G(ma.base), null, 1, ma.offset);
}
// Shorthand for {X86_64Runtime}.
def RT: X86_64Runtime;

// A macro-assembler label that wraps the underlying assembler-level {X86_64Label}.
class X86_64MasmLabel extends MasmLabel {
	def label: X86_64Label;	// the low-level label that jumps are emitted against

	// Creates a label with the given creation position, wrapping {label}.
	new(create_pos: int, label) super(create_pos) {	}
}
// The x86-64 implementation of {MacroAssembler}, which emits machine code into a {DataWriter}
// through an {X86_64Assembler}.
class X86_64MacroAssembler extends MacroAssembler {
	// The writer that receives the generated code.
	def w: DataWriter;
	// The low-level assembler; {asm.d} and {asm.q} are used below for 32-bit and 64-bit operand sizes.
	def asm = X86_64Assemblers.create64(w);
	// The general-purpose scratch register, taken from the register configuration.
	var scratch: X86_64Gpr;
	// Pending jump tables: (offset of the table in {w}, target labels), patched in {setTargetAddress}.
	var jump_tables: Vector<(int, Array<X86_64Label>)>;
	// Object/field offsets; most methods below obtain them through {getOffsets()}.
	var offsets: V3Offsets;
	// Provides the entry addresses used by {emit_jump_to_trap_at}.
	var trap_stubs: X86_64SpcTrapsStub;

	// Creates a macro assembler writing to {w}; the scratch GPR is taken from {regConfig.scratch}.
	new(w, regConfig: RegConfig) super(Target.tagging, regConfig) {
		scratch = G(regConfig.scratch);
	}

	// Returns the end position of the underlying writer, i.e. the number of code bytes written so far.
	def curCodeBytes() -> u64 {
		return u64.!(w.end());
	}
	// Finalizes the code for being located at absolute address {addr}: every pending jump table
	// entry is overwritten with the absolute address ({addr} + label position) of its target.
	// The pending tables are then dropped, so calling this again is a no-op.
	def setTargetAddress(addr: u64) {
		if (jump_tables == null) return;
		for (i < jump_tables.length) {
			var t = jump_tables[i], offset = t.0, labels = t.1;
			for (j < labels.length) {
				var l = labels[j], target = addr + u64.!(l.pos);
				// Each table entry is 8 bytes wide (see {emit_br_table_r}).
				asm.w.at(offset + j * 8).put_b64(long.view(target));
			}
		}
		// Patching repositions the writer inside a table; move it back to the end (as
		// {emit_br_table_r} does after its own patch) so later emission cannot overwrite code.
		asm.w.atEnd();
		jump_tables = null;
	}

	// Label operations
	// Allocates a new, unbound label that remembers {create_pos} as its creation position.
	def newLabel(create_pos: int) -> X86_64MasmLabel {
		return X86_64MasmLabel.new(create_pos, asm.newLabel());
	}
	// Binds {l} through the assembler and mirrors the resulting position into {l.offset}.
	// Traces the binding when compiler tracing is enabled.
	def bindLabel(l: MasmLabel) {
		if (Trace.compiler) Trace.OUT.put2("    bind label (+%d) -> @%d", l.create_pos, w.end()).ln();
		var label = X86_64MasmLabel.!(l);
		asm.bind(label.label);
		label.offset = label.label.pos;
	}
	// Binds {l} to an explicit {offset} by directly setting the positions of both the
	// macro-assembler label and the underlying assembler label (no {asm.bind} call is made).
	def bindLabelTo(l: MasmLabel, offset: int) {
		if (Trace.compiler) Trace.OUT.put2("    bind label (+%d) -> @%d", l.create_pos, offset).ln();
		var label = X86_64MasmLabel.!(l);
		label.offset = offset;
		label.label.pos = offset;
	}
	// Records a source-location entry for the current assembler position.
	def recordCurSourceLoc() {
		recordSourceLoc(asm.pos());
	}
	// Records a source-location entry for a return address; intended to be called right after
	// a call instruction has been emitted.
	def recordRetSourceLoc() {
		// To distinguish an entry for the return address of a call from an entry for a Wasm instruction
		// following the call, return addresses have a {-1} adjustment.
		recordSourceLoc(asm.pos() - 1);
	}
	// Returns the scratch register for values of {kind}: the configured scratch register for
	// integer/reference kinds, and XMM15 for floating-point and vector kinds.
	def getScratchReg(kind: ValueKind) -> Reg {
		match (kind) {
			I32, I64, REF => return regConfig.scratch;
			F32, F64, V128 => return X86_64MasmRegs.XMM15; // TODO: make configurable
		}
	}
	// Returns the register for the V3 parameter at {index} for integer/reference kinds.
	// Other kinds are unimplemented.
	def getV3ParamReg(kind: ValueKind, index: int) -> Reg {
		match (kind) {
			I32, I64, REF => return X86_64MasmRegs.PARAM_GPRS[index];
			_ => unimplemented();
		}
		return Reg(0);
	}
	// Returns the register for the V3 return value at {index} for integer/reference kinds.
	// Other kinds are unimplemented.
	def getV3ReturnReg(kind: ValueKind, index: int) -> Reg {
		match (kind) {
			I32, I64, REF => return X86_64MasmRegs.RET_GPRS[index];
			_ => unimplemented();
		}
		return Reg(0);
	}

	// Records the current source location and then emits an invalid instruction.
	def emit_intentional_crash() {
		recordCurSourceLoc();
		asm.invalid();
	}

	// Loads element {index} of the V3 array in {array} into {dst}. The move instruction and the
	// index scale (4, 8 or 16) are chosen by {kind}; the displacement is the array contents offset.
	def emit_read_v3_array_r_r(kind: ValueKind, dst: Reg, array: Reg, index: Reg) {
		var a = G(array), i = G(index);
		match (kind) {
			I32 => asm.movd_r_m(G(dst), X86_64Addr.new(a, i, 4, getOffsets().Array_contents));
			F32 => asm.movss_s_m(X(dst), X86_64Addr.new(a, i, 4, getOffsets().Array_contents));
			I64, REF => asm.movq_r_m(G(dst), X86_64Addr.new(a, i, 8, getOffsets().Array_contents));
			F64 => asm.movsd_s_m(X(dst), X86_64Addr.new(a, i, 8, getOffsets().Array_contents));
			V128 => asm.movdqu_s_m(X(dst), X86_64Addr.new(a, i, 16, getOffsets().Array_contents)); // TODO: can't scale by 16
		}
	}
	// Emits a bounds check of {index} against the length field of the V3 array in {array},
	// branching to {oob_label} when the index is out of bounds.
	// The 32-bit compare is interpreted as unsigned (branch on no-carry, i.e. index >= length,
	// the same pattern {emit_br_table_r} uses), so an index with its sign bit set is rejected
	// rather than comparing as "less than" the length under a signed condition.
	def emit_bounds_check_v3_array(array: Reg, index: Reg, oob_label: MasmLabel) {
		asm.d.cmp_r_m(G(index), X86_64Addr.new(G(array), null, 1, getOffsets().Array_length));
		asm.jc_rel_far(C.NC, X86_64MasmLabel.!(oob_label).label);
	}
	// Loads the 64-bit {X86_64Memory_start} field of the memory object in {memobj} into {dst}.
	def emit_read_v3_mem_base(dst: Reg, memobj: Reg) {
		asm.movq_r_m(G(dst), X86_64Addr.new(G(memobj), null, 1, getOffsets().X86_64Memory_start));
	}

	// Loads a byte from [base + index + offset] into {dst} with sign extension, using the 64-bit
	// assembler when {kind} is I64 and the 32-bit one otherwise.
	// Any instructions needed for a large offset are emitted first; the source location is
	// recorded immediately before the load itself.
	def emit_loadbsx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		var x = if (kind == ValueKind.I64, asm.q, asm.d).movbsx_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1));
	}
	// Loads a byte from [base + index + offset] into {dst} with zero extension, using the 64-bit
	// assembler when {kind} is I64 and the 32-bit one otherwise.
	// The source location is recorded immediately before the load itself.
	def emit_loadbzx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		var x = if (kind == ValueKind.I64, asm.q, asm.d).movbzx_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1));
	}
	// Loads a 16-bit word from [base + index + offset] into {dst} with sign extension, using the
	// 64-bit assembler when {kind} is I64 and the 32-bit one otherwise.
	// The source location is recorded immediately before the load itself.
	def emit_loadwsx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		var x = if (kind == ValueKind.I64, asm.q, asm.d).movwsx_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1));
	}
	// Loads a 16-bit word from [base + index + offset] into {dst} with zero extension, using the
	// 64-bit assembler when {kind} is I64 and the 32-bit one otherwise.
	// The source location is recorded immediately before the load itself.
	def emit_loadwzx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		var x = if (kind == ValueKind.I64, asm.q, asm.d).movwzx_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1));
	}
	// Loads a 32-bit value from [base + index + offset] into {dst} and sign-extends it to 64 bits
	// with a shift-left / arithmetic-shift-right pair. {kind} is not consulted.
	def emit_loaddsx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var d = G(dst);
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		asm.q.movd_r_m(d, X86_64Addr.new(G(base), t.0, 1, t.1));
		// Sign-extend the low 32 bits: move them to the top, then shift back arithmetically.
		asm.q.shl_r_i(d, 32);
		asm.q.sar_r_i(d, 32);
	}
	// Loads a 32-bit value from [base + index + offset] into {dst} with a single {movd};
	// no explicit extension instructions follow. {kind} is not consulted.
	def emit_loaddzx_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		asm.q.movd_r_m(G(dst), X86_64Addr.new(G(base), t.0, 1, t.1));
	}
	// Loads a full-width value of {kind} from [base + index + offset] into {dst}, which is
	// treated as a GPR for integer/reference kinds and as an XMM register otherwise.
	// The source location is recorded immediately before the load itself.
	def emit_load_r_r_r_i(kind: ValueKind, dst: Reg, base: Reg, index: Reg, offset: u32) {
		var b = G(base), t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		match (kind) {
			I32 => asm.movd_r_m(G(dst), X86_64Addr.new(b, t.0, 1, t.1));
			REF, I64 => asm.movq_r_m(G(dst), X86_64Addr.new(b, t.0, 1, t.1));
			F32 => asm.movss_s_m(X(dst), X86_64Addr.new(b, t.0, 1, t.1));
			F64 => asm.movsd_s_m(X(dst), X86_64Addr.new(b, t.0, 1, t.1));
			V128 => asm.movdqu_s_m(X(dst), X86_64Addr.new(b, t.0, 1, t.1));
		}
	}
	// Records the current source location, then invokes the supplied register-from-memory move
	// {asm_mov_r_m} with the GPR for {dst} and the address {src}.
	def emit_v128_load_lane_r_m<T>(dst: Reg, src: X86_64Addr, asm_mov_r_m: (X86_64Gpr, X86_64Addr) -> T) {
		recordCurSourceLoc();
		asm_mov_r_m(G(dst), src);
	}
	// Records the current source location, then invokes the supplied memory-from-register move
	// {asm_mov_m_r} with the address {dst} and the GPR for {src}.
	def emit_v128_store_lane_m_r<T>(dst: X86_64Addr, src: Reg, asm_mov_m_r: (X86_64Addr, X86_64Gpr) -> T) {
		recordCurSourceLoc();
		asm_mov_m_r(dst, G(src));
	}
	// Builds the x86-64 address for [base + index + offset]. May emit instructions (and use the
	// scratch register) when the offset does not fit a signed 32-bit displacement.
	def decode_memarg_addr(base: Reg, index: Reg, offset: u32) -> X86_64Addr {
		var t = handle_large_offset(index, offset);
		return X86_64Addr.new(G(base), t.0, 1, t.1);
	}
	// Stores the low byte of {val} to [base + index + offset]. {kind} is not consulted.
	// The source location is recorded immediately before the store itself.
	def emit_storeb_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		asm.q.movb_m_r(X86_64Addr.new(G(base), t.0, 1, t.1), G(val));
	}
	// Stores the low 16 bits of {val} to [base + index + offset]. {kind} is not consulted.
	// The source location is recorded immediately before the store itself.
	def emit_storew_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32) {
		var t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		asm.q.movw_m_r(X86_64Addr.new(G(base), t.0, 1, t.1), G(val));
	}
	// Stores a full-width value of {kind} from {val} to [base + index + offset]; {val} is treated
	// as a GPR for integer/reference kinds and as an XMM register otherwise.
	// The source location is recorded immediately before the store itself.
	def emit_store_r_r_r_i(kind: ValueKind, val: Reg, base: Reg, index: Reg, offset: u32) {
		var b = G(base), t = handle_large_offset(index, offset);
		recordCurSourceLoc();
		match (kind) {
			I32 => asm.movd_m_r(X86_64Addr.new(b, t.0, 1, t.1), G(val));
			REF, I64 => asm.movq_m_r(X86_64Addr.new(b, t.0, 1, t.1), G(val));
			F32 => asm.movss_m_s(X86_64Addr.new(b, t.0, 1, t.1), X(val));
			F64 => asm.movsd_m_s(X86_64Addr.new(b, t.0, 1, t.1), X(val));
			V128 => asm.movdqu_m_s(X86_64Addr.new(b, t.0, 1, t.1), X(val));
		}
	}

	// Computes the (index register, displacement) pair to use for an access at {index} + {offset}.
	// If {offset} fits a non-negative signed 32-bit displacement, returns the index GPR unchanged
	// together with the offset. Otherwise emits code that materializes the offset in the scratch
	// register, adds the index (if any) to it, and returns (scratch, 0).
	private def handle_large_offset(index: Reg, offset: u32) -> (X86_64Gpr, int) {
		var ioffset = int.view(offset), r = G(index);
		if (ioffset != offset) { // too large for signed encoding, add explicitly
			asm.movd_r_i(scratch, ioffset);
			if (r != null) asm.q.add_r_r(scratch, r);
			return (scratch, 0);
		}
		return (r, int.view(offset));
	}

	// Moves {reg2} into {reg}, where either side may be a GPR or an XMM register.
	// GPR <- GPR and GPR <- XMM and XMM <- GPR moves are 64-bit; XMM <- XMM uses {movaps} for
	// V128 values and {movsd} for everything else.
	def emit_mov_r_r(kind: ValueKind, reg: Reg, reg2: Reg) {
		var gdst = G(reg), gsrc = G(reg2);
		if (gdst != null) {
			// Destination is a general-purpose register.
			if (gsrc != null) asm.movq_r_r(gdst, gsrc);
			else asm.movq_r_s(gdst, X(reg2));
			return;
		}
		// Destination is an XMM register.
		var xdst = X(reg), xsrc = X(reg2);
		if (xsrc == null) {
			asm.movq_s_r(xdst, gsrc);
			return;
		}
		if (kind == ValueKind.V128) asm.movaps_s_s(xdst, xsrc);
		else asm.movsd_s_s(xdst, xsrc);
	}
	// Loads a value of {kind} from the address {ma} into {reg} (a GPR for integer/reference
	// kinds, an XMM register otherwise).
	def emit_mov_r_m(kind: ValueKind, reg: Reg, ma: MasmAddr) {
		var addr = A(ma);
		match (kind) {
			I32 => asm.movd_r_m(G(reg), addr);
			I64, REF => asm.movq_r_m(G(reg), addr);
			F32 => asm.movss_s_m(X(reg), addr);
			F64 => asm.movsd_s_m(X(reg), addr);
			V128 => asm.movdqu_s_m(X(reg), addr);
		}
	}
	// Moves the 32-bit immediate {val} into the GPR {reg} with a 32-bit move.
	def emit_mov_r_i(reg: Reg, val: int) {
		asm.movd_r_i(G(reg), val);
	}
	// Moves the 32-bit immediate {val} into the GPR {reg} with a 64-bit move.
	def emit_mov_r_l32(reg: Reg, val: int) {
		asm.movq_r_i(G(reg), val);
	}
	// Loads the 32-bit float bit pattern {val} into the XMM register {reg}, going through the
	// scratch GPR (which is overwritten).
	def emit_mov_r_f32(reg: Reg, val: u32) {
		asm.movd_r_i(scratch, int.view(val));
		asm.movd_s_r(X(reg), scratch);
	}
	// Moves the 64-bit immediate {val} into the GPR {reg}.
	def emit_mov_r_l(reg: Reg, val: long) {
		asm.movq_r_l(G(reg), long.view(val));
	}
	// Loads the 64-bit float bit pattern {val} into the XMM register {reg}.
	// An all-zero pattern is produced by xor-ing the register with itself; any other pattern
	// goes through the scratch GPR (which is overwritten).
	def emit_mov_r_d64(reg: Reg, val: u64) {
		var xdst = X(reg);
		if (val == 0) {
			asm.xorpd_s_s(xdst, xdst);
			return;
		}
		asm.movq_r_l(scratch, long.view(val));
		asm.movq_s_r(xdst, scratch);
	}
	// Loads a 128-bit constant into the XMM register {reg}: {low} is inserted as quadword 0 and
	// {high} as quadword 1, each via the scratch GPR (which is overwritten).
	def emit_mov_r_q(reg: Reg, low: u64, high: u64) {
		var x = X(reg);
		asm.movq_r_l(scratch, long.view(low));
		asm.pinsrq_s_r_i(x, scratch, 0);
		asm.movq_r_l(scratch, long.view(high));
		asm.pinsrq_s_r_i(x, scratch, 1);
	}
	// Moves the absolute address of the trap object for {reason} into the GPR {reg}.
	def emit_mov_r_trap(reg: Reg, reason: TrapReason) {
		var ptr = Pointer.atObject(Execute.trapObjects[reason.tag]);
		asm.movq_r_l(G(reg), ptr - Pointer.NULL);
	}
	// Moves the absolute address {abs} into the GPR {reg} as a 32-bit immediate.
	// The {u32.!} cast requires the address to fit in 32 bits.
	def emit_mov_r_abs32(reg: Reg, abs: Pointer) {
		asm.movd_r_i(G(reg), int.view(u32.!(abs - Pointer.NULL)));
	}

	// Stores a value of {kind} from {reg} (a GPR for integer/reference kinds, an XMM register
	// otherwise) to the address {ma}.
	def emit_mov_m_r(kind: ValueKind, ma: MasmAddr, reg: Reg) {
		var addr = A(ma);
		match (kind) {
			I32 => asm.movd_m_r(addr, G(reg));
			I64, REF => asm.movq_m_r(addr, G(reg));
			F32 => asm.movss_m_s(addr, X(reg));
			F64 => asm.movsd_m_s(addr, X(reg));
			V128 => asm.movdqu_m_s(addr, X(reg));
		}
	}
	// Stores the 32-bit immediate {val} to the address {ma}.
	def emit_mov_m_i(ma: MasmAddr, val: int) {
		asm.movd_m_i(A(ma), val);
	}
	// Stores the 64-bit immediate {val} to the address {ma}.
	// When {val} is representable as a 32-bit signed integer, a single 64-bit store of that
	// immediate is emitted; otherwise the low and high 32-bit halves are stored separately,
	// the high half at displacement + 4.
	def emit_mov_m_l(ma: MasmAddr, val: long) {
		var lo_addr = A(ma), lo = int.view(val);
		if (val == lo) {
			asm.movq_m_i(lo_addr, lo);
			return;
		}
		// XXX: use constant pool?
		var hi_addr = X86_64Addr.new(lo_addr.base, lo_addr.index, lo_addr.scale, lo_addr.disp + 4);
		asm.movd_m_i(lo_addr, lo);
		asm.movd_m_i(hi_addr, int.view(val >> 32));
	}
	// Stores the 32-bit float bit pattern {bits} to the address {ma}.
	def emit_mov_m_f(ma: MasmAddr, bits: u32) {
		asm.movd_m_i(A(ma), int.view(bits));
	}
	// Stores the 64-bit float bit pattern {bits} to the address {ma} (delegates to {emit_mov_m_l}).
	def emit_mov_m_d(ma: MasmAddr, bits: u64) {
		emit_mov_m_l(ma, long.view(bits));
	}
	// Stores a 128-bit constant to the address {ma}: {low} at offset 0 and {high} at offset 8.
	def emit_mov_m_q(ma: MasmAddr, low: u64, high: u64) {  // XXX: use constant pool?
		emit_mov_m_l(ma, long.view(low));
		emit_mov_m_l(MasmAddr(ma.base, ma.offset + 8), long.view(high));
	}
	// Copies a value of {kind} from the address {src} to the address {dst}.
	// 32-bit and 64-bit values go through the scratch GPR; V128 values go through XMM15.
	def emit_mov_m_m(kind: ValueKind, dst: MasmAddr, src: MasmAddr) {
		match (kind) {
			I32, F32 => {
				asm.movd_r_m(scratch, A(src));
				asm.movd_m_r(A(dst), scratch);
			}
			I64, F64, REF => {
				asm.movq_r_m(scratch, A(src));
				asm.movq_m_r(A(dst), scratch);
			}
			V128 => {
				// Note: this local shadows the GPR {scratch} field within this case.
				var scratch = R.XMM15; // TODO
				asm.movdqu_s_m(scratch, A(src));
				asm.movdqu_m_s(A(dst), scratch);
			}
		}
	}

	// Adds the GPR {reg2} to the GPR {reg}.
	// NOTE(review): uses {asm} without an explicit {.d}/{.q} size selector — confirm the intended operand width.
	def emit_addi_r_r(reg: Reg, reg2: Reg) {
		asm.add_r_r(G(reg), G(reg2));
	}
	// Adds the immediate {val} to the GPR {reg}.
	// NOTE(review): uses {asm} without an explicit {.d}/{.q} size selector — confirm the intended operand width.
	def emit_addi_r_i(reg: Reg, val: int) {
		asm.add_r_i(G(reg), val);
	}

	// Subtracts the immediate {val} from the GPR {reg} using the default-width assembler {asm}.
	def emit_subw_r_i(reg: Reg, val: int) {
		asm.sub_r_i(G(reg), val);
	}
	// Subtracts the GPR {reg2} from the GPR {reg} using the default-width assembler {asm}.
	def emit_subw_r_r(reg: Reg, reg2: Reg) {
		asm.sub_r_r(G(reg), G(reg2));
	}
	// Adds the immediate {val} to the GPR {r1} using the default-width assembler {asm}.
	def emit_addw_r_i(r1: Reg, val: int) {
		asm.add_r_i(G(r1), val);
	}
	// Adds the GPR {r2} to the GPR {r1} using the default-width assembler {asm}.
	def emit_addw_r_r(r1: Reg, r2: Reg) {
		asm.add_r_r(G(r1), G(r2));
	}
	// Shifts the GPR {reg} left by {imm} bits using the default-width assembler {asm}.
	def emit_shlw_r_i(reg: Reg, imm: u6) {
		asm.shl_r_i(G(reg), imm);
	}
	// Shifts the GPR {reg} right (logical) by {imm} bits using the default-width assembler {asm}.
	def emit_shrw_r_i(reg: Reg, imm: u6) {
		asm.shr_r_i(G(reg), imm);
	}

	// Loads a 32-bit value from {addr} into {dst} and sign-extends it to 64 bits with a
	// shift-left / arithmetic-shift-right pair. Returns the assembler so it can be used where an
	// assembler-returning move is expected.
	def emit_movq_32s_r_m(dst: X86_64Gpr, addr: X86_64Addr) -> X86_64Assembler {
		asm.movd_r_m(dst, addr);
		asm.q.shl_r_i(dst, 32);
		asm.q.sar_r_i(dst, 32);
		return asm;
	}

	// Emits a signed 32-bit division of EAX by {r2}.
	// A divisor of -1 is handled without {idiv}: the code jumps to a DIV_UNREPRESENTABLE trap
	// label if the dividend is 0x80000000, and otherwise negates the dividend.
	// The source location is recorded immediately before the {idiv} instruction.
	def emit_i32_div_s(r2: X86_64Gpr) { // note: (r1=EAX, r2) -> EAX, kills EDX
		var r1 = R.RAX;
		var div = X86_64Label.new(), done = X86_64Label.new();
		asm.d.cmp_r_i(r2, -1);
		asm.jc_rel_near(C.NZ, div);
		// divisor == -1
		asm.d.cmp_r_i(r1, 0x80000000);
		asm.jc_rel_far(C.Z, X86_64MasmLabel.!(newTrapLabel(TrapReason.DIV_UNREPRESENTABLE)).label);
		asm.d.neg_r(r1);
		asm.jmp_rel_near(done);
		// divisor != -1: sign-extend EAX into EDX and divide
		asm.bind(div);
		asm.d.cdq();
		recordCurSourceLoc();
		asm.d.idiv_r(r2);
		asm.bind(done);
	}
	// Emits an unsigned 32-bit division of EAX by {r2}: EDX is zeroed first, and the source
	// location is recorded immediately before the {div} instruction.
	def emit_i32_div_u(r2: X86_64Gpr) { // note: (r1=EAX, r2) -> EAX, kills EDX
		asm.d.movd_r_i(R.RDX, 0);
		recordCurSourceLoc();
		asm.d.div_r(r2);
	}
	// Emits a signed 32-bit remainder of EAX by {r2}, leaving the result in EDX.
	// A divisor of -1 is handled without {idiv} by setting the result to 0.
	// The source location is recorded immediately before the {idiv} instruction.
	def emit_i32_rem_s(r2: X86_64Gpr) { // note: (r1=EAX, r2) -> EDX, kills RAX
		var r1 = R.RDX;
		var div = X86_64Label.new(), done = X86_64Label.new();
		asm.d.cmp_r_i(r2, -1);
		asm.jc_rel_near(C.NZ, div);
		// divisor == -1: result is 0
		asm.movd_r_i(r1, 0);
		asm.jmp_rel_near(done);
		asm.bind(div);
		asm.d.cdq();
		recordCurSourceLoc();
		asm.d.idiv_r(r2);
		asm.bind(done);
	}
	// Emits an unsigned 32-bit remainder of EAX by {r2}: EDX is zeroed first, and the source
	// location is recorded immediately before the {div} instruction.
	def emit_i32_rem_u(r2: X86_64Gpr) { // note: (r1=EAX, r2) -> EDX, kills RAX
		asm.movd_r_i(R.RDX, 0);
		recordCurSourceLoc();
		asm.d.div_r(r2);
	}
	// Emits a signed 64-bit division of RAX by {r2}.
	// A divisor of -1 is handled without {idiv}: the code jumps to a DIV_UNREPRESENTABLE trap
	// label if the dividend equals 0x80 << 56 (built in the scratch register), and otherwise
	// negates the dividend.
	// The source location is recorded immediately before the {idiv} instruction.
	def emit_i64_div_s(r2: X86_64Gpr) { // note: (r1=RAX, r2) -> RAX, kills RDX
		var div = X86_64Label.new(), done = X86_64Label.new();
		var r1 = R.RAX;
		asm.q.cmp_r_i(r2, -1);
		asm.jc_rel_near(C.NZ, div);
		// divisor == -1: build 0x8000_0000_0000_0000 in scratch and compare with the dividend
		asm.movq_r_i(scratch, 0x80); // XXX: use BTS
		asm.q.shl_r_i(scratch, 56);
		asm.q.cmp_r_r(r1, scratch);
		asm.jc_rel_far(C.Z, X86_64MasmLabel.!(newTrapLabel(TrapReason.DIV_UNREPRESENTABLE)).label);
		asm.q.neg_r(r1);
		asm.jmp_rel_near(done);
		asm.bind(div);
		asm.q.cqo();
		recordCurSourceLoc();
		asm.q.idiv_r(r2);
		asm.bind(done);
	}
	// Emits an unsigned 64-bit division of RAX by {r2}: RDX is zeroed first, and the source
	// location is recorded immediately before the {div} instruction.
	def emit_i64_div_u(r2: X86_64Gpr) { // note: (r1=RAX, r2) -> RAX, kills RDX
		asm.movd_r_i(R.RDX, 0);
		recordCurSourceLoc();
		asm.div_r(r2);
	}
	// Emits a signed 64-bit remainder of RAX by {r2}, leaving the result in RDX.
	// A divisor of -1 is handled without {idiv} by setting the result to 0.
	// The source location is recorded immediately before the {idiv} instruction.
	def emit_i64_rem_s(r2: X86_64Gpr) { // note: (r1=RAX, r2) -> RDX, kills RAX
		var do_div = X86_64Label.new(), finish = X86_64Label.new();
		asm.cmp_r_i(r2, -1);
		asm.jc_rel_near(C.NZ, do_div);
		// divisor == -1: result is 0
		asm.movq_r_i(R.RDX, 0);
		asm.jmp_rel_near(finish);
		asm.bind(do_div);
		asm.cqo();
		recordCurSourceLoc();
		asm.idiv_r(r2);
		asm.bind(finish);
	}
	// Emits an unsigned 64-bit remainder of RAX by {r2}: RDX is zeroed first, and the source
	// location is recorded immediately before the {div} instruction.
	def emit_i64_rem_u(r2: X86_64Gpr) { // note: (r1=RAX, r2) -> RDX, kills RAX
		asm.movd_r_i(R.RDX, 0);
		recordCurSourceLoc();
		asm.div_r(r2);
	}

	// Generic register-register binary operation; not implemented for this target.
	def emit_binop_r_r(op: Opcode, reg: Reg, reg2: Reg) {
		unimplemented();
	}
	// Generic register-memory binary operation; not implemented for this target.
	def emit_binop_r_m(op: Opcode, reg: Reg, ma: MasmAddr) {
		unimplemented();
	}
	// Generic register-immediate binary operation; not implemented for this target.
	def emit_binop_r_i(op: Opcode, reg: Reg, val: int) {
		unimplemented();
	}
	// Compares {r1} with the immediate {val} (64-bit) and replaces {r1} with the outcome of
	// {cond}: set-on-condition into the low byte followed by a zero-extending byte move.
	def emit_cmpq_r_i(cond: X86_64Cond, r1: X86_64Gpr, val: int) {
		asm.q.cmp_r_i(r1, val);
		asm.set_r(cond, r1);
		asm.q.movbzx_r_r(r1, r1);
	}
	// Compares {r2} with the immediate {val} (64-bit) and writes the outcome of {cond} into
	// {r1}: set-on-condition into the low byte followed by a zero-extending byte move.
	def emit_cmpq_r_r_i(cond: X86_64Cond, r1: X86_64Gpr, r2: X86_64Gpr, val: int) {
		asm.q.cmp_r_i(r2, val);
		asm.set_r(cond, r1);
		asm.d.movbzx_r_r(r1, r1);
	}
	// Compares {r2} with {r3} (64-bit) and writes the outcome of {cond} into {r1}:
	// set-on-condition into the low byte followed by a zero-extending byte move.
	def emit_cmpq_r_r_r(cond: X86_64Cond, r1: X86_64Gpr, r2: X86_64Gpr, r3: X86_64Gpr) {
		asm.q.cmp_r_r(r2, r3);
		asm.set_r(cond, r1);
		asm.d.movbzx_r_r(r1, r1);
	}
	// Compares {r1} with {r2} (64-bit) and replaces {r1} with the outcome of {cond}:
	// set-on-condition into the low byte followed by a zero-extending byte move.
	def emit_cmpq_r_r(cond: X86_64Cond, r1: X86_64Gpr, r2: X86_64Gpr) {
		asm.q.cmp_r_r(r1, r2);
		asm.set_r(cond, r1);
		asm.q.movbzx_r_r(r1, r1);
	}
	// Compares {r1} with the 64-bit memory operand at {addr} and replaces {r1} with the outcome
	// of {cond}: set-on-condition into the low byte followed by a zero-extending byte move.
	def emit_cmpq_r_m(cond: X86_64Cond, r1: X86_64Gpr, addr: X86_64Addr) {
		asm.q.cmp_r_m(r1, addr);
		asm.set_r(cond, r1);
		asm.q.movbzx_r_r(r1, r1);
	}
	// Pops a value of {kind} from the machine stack into {reg}. Floating-point values are popped
	// into the scratch GPR and then moved into the XMM register. V128 is not supported and
	// reports an error at code-generation time.
	def emit_pop_r(kind: ValueKind, reg: Reg) {
		match (kind) {
			I32 => asm.d.popq_r(G(reg));
			F32 => { asm.d.popq_r(scratch); asm.movd_s_r(X(reg), scratch); }
			I64, REF => asm.q.popq_r(G(reg));
			F64 => { asm.q.popq_r(scratch);  asm.movq_s_r(X(reg), scratch); }
			V128 => System.error("X86_64MacroAssembler", "cannot pop v128 type"); // TODO
		}
	}
	// Emits a return instruction.
	def emit_ret() {
		asm.ret();
	}
	// Emits a 64-bit {or} of RAX with itself as a no-op placeholder.
	// NOTE(review): an OR presumably still updates the flags — confirm no caller relies on
	// flags being preserved across this "nop".
	def emit_nop() {
		asm.q.or_r_r(R.RAX, R.RAX);
	}

	// Emits a float-to-integer truncation for {op}, converting {x1} into {dst} and using
	// {xscratch} as a temporary. Trapping opcodes pass saturate=false to {emit_i_trunc_f0};
	// saturating opcodes pass saturate=true, except the two signed i64 saturating variants,
	// which have dedicated implementations. Other opcodes are unimplemented.
	def emit_i_trunc_f(op: Opcode, dst: X86_64Gpr, x1: X86_64Xmmr, xscratch: X86_64Xmmr) {
		match (op) {
			I32_TRUNC_F32_S => emit_i_trunc_f0(TRUNC_i32_f32_s, false, dst, x1, xscratch);
			I32_TRUNC_F32_U => emit_i_trunc_f0(TRUNC_i32_f32_u, false, dst, x1, xscratch);
			I32_TRUNC_F64_S => emit_i_trunc_f0(TRUNC_i32_f64_s, false, dst, x1, xscratch);
			I32_TRUNC_F64_U => emit_i_trunc_f0(TRUNC_i32_f64_u, false, dst, x1, xscratch);
			I64_TRUNC_F32_S => emit_i_trunc_f0(TRUNC_i64_f32_s, false, dst, x1, xscratch);
			I64_TRUNC_F32_U => emit_i_trunc_f0(TRUNC_i64_f32_u, false, dst, x1, xscratch);
			I64_TRUNC_F64_S => emit_i_trunc_f0(TRUNC_i64_f64_s, false, dst, x1, xscratch);
			I64_TRUNC_F64_U => emit_i_trunc_f0(TRUNC_i64_f64_u, false, dst, x1, xscratch);
			I32_TRUNC_SAT_F32_S => emit_i_trunc_f0(TRUNC_i32_f32_s, true, dst, x1, xscratch);
			I32_TRUNC_SAT_F32_U => emit_i_trunc_f0(TRUNC_i32_f32_u, true, dst, x1, xscratch);
			I32_TRUNC_SAT_F64_S => emit_i_trunc_f0(TRUNC_i32_f64_s, true, dst, x1, xscratch);
			I32_TRUNC_SAT_F64_U => emit_i_trunc_f0(TRUNC_i32_f64_u, true, dst, x1, xscratch);
			I64_TRUNC_SAT_F32_S => emit_i64_trunc_sat_f32_s(dst, x1, xscratch); // custom
			I64_TRUNC_SAT_F32_U => emit_i_trunc_f0(TRUNC_i64_f32_u, true, dst, x1, xscratch);
			I64_TRUNC_SAT_F64_S => emit_i64_trunc_sat_f64_s(dst, x1, xscratch); // custom
			I64_TRUNC_SAT_F64_U => emit_i_trunc_f0(TRUNC_i64_f64_u, true, dst, x1, xscratch);
			_ => unimplemented();
		}
	}
	// Emits a saturating f32 -> signed i64 truncation of {x1} into {dst}.
	// The input is compared against the {Floats.f_1p63} constant: an unordered compare (NaN)
	// yields 0, an input that is not below the constant yields 0x7FFFFFFF_FFFFFFFF, and
	// anything else is rounded toward zero and converted. {dst} doubles as the temporary used
	// to load the constant; {x1} and {xscratch} are overwritten.
	private def emit_i64_trunc_sat_f32_s(dst: X86_64Gpr, x1: X86_64Xmmr, xscratch: X86_64Xmmr) {
		asm.movd_r_i(dst, int.view(Floats.f_1p63));
		asm.movd_s_r(xscratch, dst);
		asm.ucomiss_s_s(x1, xscratch);
		var is_nan = X86_64Label.new(), ovf_pos = X86_64Label.new(), done = X86_64Label.new();
		asm.jc_rel_near(C.P, is_nan);
		asm.jc_rel_near(C.NC, ovf_pos);
		asm.roundss_s_s(x1, x1, X86_64Rounding.TO_ZERO);
		asm.q.cvtss2si_r_s(dst, x1);
		asm.jmp_rel_near(done);
		asm.bind(is_nan);
		asm.movd_r_i(dst, 0);
		asm.jmp_rel_near(done);
		asm.bind(ovf_pos);
		asm.movq_r_i(dst, 0xFFFFFFFE);  // TODO: tricky constant
		asm.q.ror_r_i(dst, 1); // result = 0x7FFFFFFF_FFFFFFFF
		asm.bind(done);
	}
	// Emits a saturating f64 -> signed i64 truncation of {x1} into {dst}.
	// The {Floats.d_1p63} constant is built from its upper 32 bits shifted left by 32. An
	// unordered compare (NaN) yields 0, an input that is not below the constant yields
	// 0x7FFFFFFF_FFFFFFFF, and anything else is rounded toward zero and converted.
	// {dst} doubles as the temporary used to load the constant; {x1} and {xscratch} are overwritten.
	private def emit_i64_trunc_sat_f64_s(dst: X86_64Gpr, x1: X86_64Xmmr, xscratch: X86_64Xmmr) {
		asm.movd_r_i(dst, int.view(Floats.d_1p63 >> 32));
		asm.q.shl_r_i(dst, 32);
		asm.movq_s_r(xscratch, dst);
		asm.ucomisd_s_s(x1, xscratch);
		var is_nan = X86_64Label.new(), ovf_pos = X86_64Label.new(), done = X86_64Label.new();
		asm.jc_rel_near(C.P, is_nan);
		asm.jc_rel_near(C.NC, ovf_pos);
		asm.roundsd_s_s(x1, x1, X86_64Rounding.TO_ZERO);
		asm.q.cvtsd2si_r_s(dst, x1);
		asm.jmp_rel_near(done);
		asm.bind(is_nan);
		asm.movd_r_i(dst, 0);
		asm.jmp_rel_near(done);
		asm.bind(ovf_pos);
		asm.movq_r_i(dst, 0xFFFFFFFE); // TODO: tricky constant
		asm.q.ror_r_i(dst, 1); // result = 0x7FFFFFFF_FFFFFFFF
		asm.bind(done);
	}
	// Emits a float-to-integer truncation of {x1} into {dst}, parameterized by {config}
	// (source/destination widths, signedness, bounds, and the matching instruction emitters).
	// When {saturate} is false, NaN and out-of-range inputs branch to a FLOAT_UNREPRESENTABLE
	// trap label; when true, NaN and too-small inputs produce 0 and too-large inputs produce
	// {config.ceilv}. Overwrites {x1}, {xscratch} and the scratch GPR.
	private def emit_i_trunc_f0(config: FloatTrunc, saturate: bool, dst: X86_64Gpr, x1: X86_64Xmmr, xscratch: X86_64Xmmr) {
		// Compare against the upper bound; the parity flag signals an unordered (NaN) compare.
		config.mov_s_i(asm, xscratch, config.maxv, scratch);
		config.ucomi_s_s(asm, x1, xscratch);
		// The trap label is only created for the trapping (non-saturating) variant.
		var trap = if(!saturate, X86_64MasmLabel.!(newTrapLabel(TrapReason.FLOAT_UNREPRESENTABLE)).label);
		var above = X86_64Label.new(), is_nan = X86_64Label.new(), below = X86_64Label.new();
		var done = X86_64Label.new();
		if (saturate) asm.jc_rel_near(C.P, is_nan);
		else asm.jc_rel_far(C.P, trap);
		if (saturate) asm.jc_rel_near(C.NC, above);
		else asm.jc_rel_far(C.NC, trap);
		var not_big = X86_64Label.new();

		if (config.isI64 && !config.isSigned) {
			// handle u64 convert of 1p63 < v <= 1p64
			config.mov_s_i(asm, xscratch, if(config.isF64, Floats.d_1p63, Floats.f_1p63), scratch);
			config.ucomi_s_s(asm, x1, xscratch);
			asm.jc_rel_near(C.C, not_big);
			// Subtract 2^63, convert as signed, then add 2^63 back as an integer.
			config.sub_s_s(asm, x1, xscratch);
			config.round_s_s(asm, x1, x1, X86_64Rounding.TO_ZERO);
			config.cvt2si_r_s(asm.q, dst, x1);
			asm.movd_r_i(scratch, 1);
			asm.ror_r_i(scratch, 1);
			asm.q.add_r_r(dst, scratch);
			asm.jmp_rel_near(done);
		}
		asm.bind(not_big);

		// The lower-bound check is skipped only for the saturating signed 32-bit case.
		if (!saturate || config.isI64 || !config.isSigned) {
			config.mov_s_i(asm, xscratch, config.minv, scratch);
			config.ucomi_s_s(asm, x1, xscratch);
			if (saturate) asm.jc_rel_near(C.NA, below); // v <= min
			else asm.jc_rel_far(C.NA, trap); // v <= min
		}

		// In range: round toward zero and convert; only signed 32-bit results use the 32-bit convert.
		config.round_s_s(asm, x1, x1, X86_64Rounding.TO_ZERO);
		if (!config.isI64 && config.isSigned) {
			config.cvt2si_r_s(asm.d, dst, x1);
		} else {
			config.cvt2si_r_s(asm.q, dst, x1);
		}
		if (saturate) {
			asm.jmp_rel_near(done);
			asm.bind(above);
			config.mov_r_i(asm, dst, config.ceilv);
			asm.jmp_rel_near(done);
			// NaN and below-minimum inputs share the same result of 0.
			asm.bind(is_nan);
			asm.bind(below);
			asm.movd_r_i(dst, 0);
		}
		asm.bind(done);
	}
	// Emits an unsigned i64 -> f32 conversion of {r1} into {x1}.
	// The value is first converted as signed; if its sign bit is set, it is halved (keeping the
	// low bit OR-ed in), converted again, and the result is multiplied by the {Floats.f_1p1}
	// constant. {r1}, {scratch} and {xscratch} are overwritten on that path.
	def emit_f32_convert_i64_u(x1: X86_64Xmmr, r1: X86_64Gpr, xscratch: X86_64Xmmr, scratch: X86_64Gpr) {
		asm.q.cvtsi2ss_s_r(x1, r1);
		asm.q.cmp_r_i(r1, 0);
		var done = X86_64Label.new();
		asm.jc_rel_near(C.NS, done);
		// input < 0, compute 2.0d * cvt((x >> 1) | (x&1))
		asm.movq_r_r(scratch, r1);
		asm.q.and_r_i(scratch, 1);
		asm.q.shr_r_i(r1, 1);
		asm.q.or_r_r(r1, scratch);
		asm.q.cvtsi2ss_s_r(x1, r1);
		asm.movd_r_i(scratch, int.view(Floats.f_1p1)); // XXX: const could be in memory
		asm.movd_s_r(xscratch, scratch);
		asm.mulss_s_s(x1, xscratch);
		// done
		asm.bind(done);
	}
	// Emits an unsigned i64 -> f64 conversion of {r1} into {x1}.
	// The value is first converted as signed; if its sign bit is set, it is halved (keeping the
	// low bit OR-ed in), converted again, and the result is multiplied by the {Floats.d_1p1}
	// constant (built from its upper 32 bits shifted left by 32). {r1}, {scratch} and
	// {xscratch} are overwritten on that path.
	def emit_f64_convert_i64_u(x1: X86_64Xmmr, r1: X86_64Gpr, xscratch: X86_64Xmmr, scratch: X86_64Gpr) {
		asm.q.cvtsi2sd_s_r(x1, r1);
		asm.q.cmp_r_i(r1, 0);
		var done = X86_64Label.new();
		asm.jc_rel_near(C.NS, done);
		// input < 0, compute 2.0d * cvt((x >> 1) | (x&1))
		asm.movq_r_r(scratch, r1);
		asm.q.and_r_i(scratch, 1);
		asm.q.shr_r_i(r1, 1);
		asm.q.or_r_r(r1, scratch);
		asm.q.cvtsi2sd_s_r(x1, r1);
		asm.movd_r_i(scratch, int.view(Floats.d_1p1 >> 32));
		asm.q.shl_r_i(scratch, 32);
		asm.movq_s_r(xscratch, scratch);
		asm.mulsd_s_s(x1, xscratch);
		// done
		asm.bind(done);
	}
	// Emits f32 copysign: {x1} keeps its magnitude bits and takes bit 31 (the sign) from {x2}.
	// Both scratch GPRs are overwritten.
	def emit_f32_copysign(x1: X86_64Xmmr, x2: X86_64Xmmr, scratch1: X86_64Gpr, scratch2: X86_64Gpr) {
		asm.movd_r_s(scratch2, x2);
		asm.d.and_r_i(scratch2, 0x80000000);	// isolate the sign bit of x2
		asm.movd_r_s(scratch1, x1);
		asm.d.btr_r_i(scratch1, 31);		// clear the sign bit of x1
		asm.d.or_r_r(scratch1, scratch2);
		asm.movd_s_r(x1, scratch1);
	}
	// Emits f64 copysign: {x1} keeps its magnitude bits and takes bit 63 (the sign) from {x2}.
	// The sign bit of {x2} is isolated with a shift-right / shift-left pair.
	// Both scratch GPRs are overwritten.
	def emit_f64_copysign(x1: X86_64Xmmr, x2: X86_64Xmmr, scratch1: X86_64Gpr, scratch2: X86_64Gpr) {
		asm.movq_r_s(scratch2, x2);
		asm.q.shr_r_i(scratch2, 63); // XXX: use shl, rcl, rcr?
		asm.q.shl_r_i(scratch2, 63);
		asm.movq_r_s(scratch1, x1);
		asm.q.btr_r_i(scratch1, 63);		// clear the sign bit of x1
		asm.q.or_r_r(scratch1, scratch2);
		asm.movq_s_r(x1, scratch1);
	}

	// Emits an unconditional (far-form) jump to {label}.
	def emit_br(label: MasmLabel) {
		asm.jmp_rel_far(X86_64MasmLabel.!(label).label);
	}
	// Emits a conditional branch to {label} based on the GPR {reg}.
	// For the Wasm-function conditions, the 32-bit word at [reg + 0] is compared with the
	// {WasmFunction_typeId} offset constant; for all other conditions {reg} is compared with 0
	// using the 32-bit or 64-bit width selected by {cond.i32}. The branch is taken on "equal"
	// when {cond.zero} is set and on "not equal" otherwise.
	def emit_br_r(reg: Reg, cond: MasmBrCond, label: MasmLabel) {
		match (cond) {
			IS_WASM_FUNC, IS_NOT_WASM_FUNC => {
				// Go through {getOffsets()} like every other user in this class instead of reading
				// the {offsets} field directly, which may not have been initialized yet.
				asm.d.cmp_m_i(G(reg).plus(0), getOffsets().WasmFunction_typeId);
			}
			_ => {
				(if(cond.i32, asm.d, asm.q)).cmp_r_i(G(reg), 0);
			}
		}
		var cc = if(cond.zero, X86_64Conds.Z, X86_64Conds.NZ);
		asm.jc_rel_far(cc, X86_64MasmLabel.!(label).label);
	}
	// Emits a conditional branch to {label} based on comparing the memory operand at {addr}
	// with 0, using the 32-bit or 64-bit width selected by {cond.i32}. The branch is taken on
	// "equal" when {cond.zero} is set and on "not equal" otherwise.
	def emit_br_m(addr: MasmAddr, cond: MasmBrCond, label: MasmLabel) {
		var sized = if(cond.i32, asm.d, asm.q);
		sized.cmp_m_i(A(addr), 0);
		var cc = if(cond.zero, X86_64Conds.Z, X86_64Conds.NZ);
		asm.jc_rel_far(cc, X86_64MasmLabel.!(label).label);
	}
	// Branches to {label} if the low 32 bits of the GPR {r} equal {val}.
	def emit_breq_r_i(r: Reg, val: int, label: MasmLabel) {
		asm.d.cmp_r_i(G(r), val);
		asm.jc_rel_far(X86_64Conds.Z, X86_64MasmLabel.!(label).label);
	}
	// Branches to {label} if the 64-bit GPR {r} equals the immediate {val} (64-bit compare).
	def emit_breq_r_l(r: Reg, val: int, label: MasmLabel) {
		asm.q.cmp_r_i(G(r), val);
		asm.jc_rel_far(X86_64Conds.Z, X86_64MasmLabel.!(label).label);
	}
	// Branches to {label} if the low 32 bits of the GPR {r} differ from {val}.
	def emit_brne_r_i(r: Reg, val: int, label: MasmLabel) {
		asm.d.cmp_r_i(G(r), val);
		asm.jc_rel_far(X86_64Conds.NZ, X86_64MasmLabel.!(label).label);
	}
	// Emits a table-driven multi-way branch on the GPR {reg}.
	// An index that is not below {labels.length} (unsigned) branches to the last label; any other
	// index jumps indirectly through an 8-byte-per-entry table placed inline after the jump.
	// The table contents are left unfilled here and recorded in {jump_tables}, to be patched by
	// {setTargetAddress}. The scratch register is overwritten.
	def emit_br_table_r(reg: Reg, labels: Array<MasmLabel>) {
		// XXX: simplify the label patching logic by improving X86_64Assembler
		var r1 = G(reg);
		asm.d.cmp_r_i(r1, labels.length);
		asm.jc_rel_far(C.NC, X86_64MasmLabel.!(labels[labels.length - 1]).label);
		// Emit a {lea} with a placeholder displacement; the patcher captures where to fix it up.
		var patcher = X86_64MasmJumpTablePatcher.new();
		asm.q.patcher = patcher;
		asm.q.lea(scratch, X86_64Addr.new(null, null, 1, REL_MARKER));
		asm.q.patcher = null;
		asm.ijmp_m(X86_64Addr.new(scratch, r1, 8, 0));
		// Reserve the 8-byte-aligned table and remember it for later patching.
		w.align(8);
		var jtpos = w.atEnd().pos;
		if (jump_tables == null) jump_tables = Vector.new();
		jump_tables.put(jtpos, Arrays.map(labels, getLabel));
		w.skipN(labels.length * 8);
		// Patch the {lea} displacement to refer to the table, then return the writer to the end.
		w.at(patcher.pos).put_b32(jtpos - (patcher.pos + patcher.delta));
		w.atEnd();
	}
	// Emits an indirect call to the absolute address {abs} through the scratch register.
	// Unlike {emit_call_r}, no return-address source location is recorded here.
	def emit_call_abs(abs: Pointer) {
		// XXX: direct (relative) call if start is known and displacement is 32-bit.
		asm.movq_r_l(scratch, abs - Pointer.NULL);
		asm.icall_r(scratch);
	}
	// Emits an indirect call through the GPR {reg} and records the return-address source location.
	def emit_call_r(reg: Reg) {
		asm.icall_r(G(reg));
		recordRetSourceLoc();
	}
	// Emits an indirect jump through the GPR {reg}.
	def emit_jump_r(reg: Reg) {
		asm.ijmp_r(G(reg));
	}
	// Emits code that adds {increment} to the count field of {probe}.
	// The probe's address is loaded into {tmp} as an embedded object reference (its offset in the
	// code is registered via {addEmbeddedRefOffset}), and the count is updated in memory with
	// an {inc}, an add-immediate, or — for increments that do not fit in 31 bits — an
	// add-register.
	def emit_increment_CountProbe(tmp: Reg, probe: CountProbe, increment: u64) {
		var r1 = G(tmp);
		var refOffset = asm.movq_r_p(r1, Pointer.atObject(probe) - Pointer.NULL);
		addEmbeddedRefOffset(refOffset);
		var addr = r1.plus(getOffsets().CountProbe_count);
		if (increment == 1) {
			asm.inc_m(addr);
		} else if (u31.?(increment)) {
			asm.add_m_i(addr, u31.!(increment));
		} else {
			// The large constant must not be loaded into {tmp}: that register is the base of
			// {addr}, so overwriting it would make the add target a bogus address. Use the
			// scratch register instead.
			// NOTE(review): this assumes callers never pass the scratch register as {tmp} — confirm.
			asm.movq_r_l(scratch, long.view(increment));
			asm.add_m_r(addr, scratch);
		}
	}
	// Emits a call to the {fire_i} method of {probe}: the probe object is loaded into the first
	// V3 parameter GPR as an embedded object reference, the method's code address is loaded into
	// the scratch register, and an indirect call is made. {value_reg} is not referenced here.
	def emit_call_OperandProbe_i_v_fire(probe: OperandProbe_i_v, value_reg: Reg) {
		var codePtr = CiRuntime.unpackClosure<OperandProbe_i_v, u32, void>(probe.fire_i).0;
		var refOffset = asm.movq_r_p(Target.V3_PARAM_GPRS[0], Pointer.atObject(probe) - Pointer.NULL);
		addEmbeddedRefOffset(refOffset);
		asm.movq_r_l(scratch, codePtr - Pointer.NULL); // XXX: make direct call to runtime if within 2GB
		asm.icall_r(scratch);
		recordRetSourceLoc();
	}
	// Emits an indirect call (through the scratch register) to the host-call stub located at
	// {hostCallStubOffset} within the pre-generated interpreter code, and records the
	// return-address source location.
	def emit_call_HostCallStub() {
		var ic = X86_64PreGenStubs.getInterpreterCode();
		asm.movq_r_l(scratch, (ic.start + ic.header.hostCallStubOffset) - Pointer.NULL);
		asm.icall_r(scratch);
		recordRetSourceLoc();
	}
	// Emits an indirect jump (through the scratch register) to the host-call stub located at
	// {hostCallStubOffset} within the pre-generated interpreter code.
	def emit_jump_HostCallStub() {
		var ic = X86_64PreGenStubs.getInterpreterCode();
		asm.movq_r_l(scratch, (ic.start + ic.header.hostCallStubOffset) - Pointer.NULL);
		asm.ijmp_r(scratch);
	}
	// Emits a call to the runtime's {runtime_callHost} entry. {func_arg} is not referenced here.
	def emit_call_runtime_callHost(func_arg: Reg) {
		emit_call_runtime(RT.runtime_callHost);
	}
	// Emits a call to the runtime's {runtime_TRAP} entry.
	def emit_call_runtime_TRAP() {
		emit_call_runtime(RT.runtime_TRAP);
	}
	// Emits an indirect jump (through the scratch register) to the address that {trap_stubs}
	// provides for {reason}.
	def emit_jump_to_trap_at(reason: TrapReason) {
		var ip = trap_stubs.getIpForReason(reason);
		asm.movq_r_l(scratch, ip - Pointer.NULL);
		asm.ijmp_r(scratch);
	}
	// Emits a call to the runtime routine that implements {op}. Opcodes without a listed
	// runtime routine are unimplemented.
	def emit_call_runtime_op(op: Opcode) {
		match (op) {
			THROW => emit_call_runtime(RT.runtime_THROW);
			GLOBAL_GET => emit_call_runtime(RT.runtime_GLOBAL_GET);
			GLOBAL_SET => emit_call_runtime(RT.runtime_GLOBAL_SET);
			TABLE_GET => emit_call_runtime(RT.runtime_TABLE_GET);
			TABLE_SET => emit_call_runtime(RT.runtime_TABLE_SET);
			MEMORY_GROW => emit_call_runtime(RT.runtime_MEMORY_GROW);
			MEMORY_INIT => emit_call_runtime(RT.runtime_MEMORY_INIT);
			MEMORY_COPY => emit_call_runtime(RT.runtime_MEMORY_COPY);
			MEMORY_FILL => emit_call_runtime(RT.runtime_MEMORY_FILL);
			TABLE_INIT => emit_call_runtime(RT.runtime_TABLE_INIT);
			TABLE_COPY => emit_call_runtime(RT.runtime_TABLE_COPY);
			TABLE_GROW => emit_call_runtime(RT.runtime_TABLE_GROW);
			TABLE_FILL => emit_call_runtime(RT.runtime_TABLE_FILL);
			_ => unimplemented();
		}
	}
	// Emits a call to the runtime's {runtime_PROBE_instr} entry.
	def emit_call_runtime_Probe_instr() {
		emit_call_runtime(RT.runtime_PROBE_instr);
	}
	// Emits an indirect call to the code of the runtime routine {closure} and records the
	// return-address source location. The code address is loaded as a 32-bit immediate, so the
	// {u32.!} cast requires it to fit in 32 bits. The scratch register is overwritten.
	private def emit_call_runtime<P, R>(closure: P -> R) {
		var ptr = CiRuntime.unpackClosure<X86_64Interpreter, P, R>(closure).0;
		// Do an absolute call into the runtime
		asm.movd_r_i(scratch, int.view(u32.!(ptr - Pointer.NULL))); // XXX: make direct call to runtime if within 2GB
		asm.icall_r(scratch);
		recordRetSourceLoc();
	}
	// Emits an indirect jump to the code of the runtime routine {closure}. The code address is
	// loaded as a 32-bit immediate, so the {u32.!} cast requires it to fit in 32 bits.
	// The scratch register is overwritten.
	private def emit_jump_runtime<P, R>(closure: P -> R) {
		var ptr = CiRuntime.unpackClosure<X86_64Interpreter, P, R>(closure).0;
		// Do an absolute jump into the runtime
		asm.movd_r_i(scratch, int.view(u32.!(ptr - Pointer.NULL))); // XXX: make direct jump to runtime if within 2GB
		asm.ijmp_r(scratch);
	}
	// Emits a loop that copies {r_count} value slots from {r_src} to {r_dst}, last slot first.
	// {r_count} is scaled to a byte count in place and counted down by one slot per iteration,
	// so it is 0 when the loop exits. Uses {r_xmm0} for 16-byte values and the scratch GPR for
	// 8-byte values and for tag bytes.
	// NOTE(review): the loop body runs before the count is tested, so a count of 0 does not
	// appear to be supported — confirm that callers guarantee a non-zero count.
	def emit_value_copy(r_dst: X86_64Gpr, r_src: X86_64Gpr, r_count: X86_64Gpr, r_xmm0: X86_64Xmmr) {
		var copy = X86_64Label.new();
		asm.d.shl_r_i(r_count, valuerep.slot_size_log);
		asm.bind(copy);
		// The value occupies the last {value_size} (or pointer-size) bytes of the current slot.
		if (valuerep.value_size == 16) { // 16-byte (SIMD) values
			asm.movdqu_s_m(r_xmm0, r_src.plusR(r_count, 1, - valuerep.value_size));
			asm.movdqu_m_s(r_dst.plusR(r_count, 1, - valuerep.value_size), r_xmm0);
		} else { // 8-byte values
			asm.movq_r_m(scratch, r_src.plusR(r_count, 1, - Pointer.SIZE));
			asm.movq_m_r(r_dst.plusR(r_count, 1, - Pointer.SIZE), scratch);
		}
		if (valuerep.tagged) {
			// The tag byte sits at the start of the slot.
			asm.movb_r_m(scratch, r_src.plusR(r_count, 1, - valuerep.slot_size)); // copy tags
			asm.movb_m_r(r_dst.plusR(r_count, 1, - valuerep.slot_size), scratch);
		}
		asm.d.sub_r_i(r_count, valuerep.slot_size);
		asm.jc_rel_near(C.NZ, copy);
	}

	// Stores the GPR {vsp} into the {vsp} field of the runtime's current stack object, which is
	// first loaded into the scratch register.
	def emit_store_curstack_vsp(vsp: Reg) {
		var offsets = getOffsets();
		asm.movq_r_m(scratch, absPointer(offsets.X86_64Runtime_curStack));
		asm.movq_m_r(scratch.plus(offsets.X86_64Stack_vsp), G(vsp));
	}
	// Loads the {vsp} field of the runtime's current stack object into the GPR {vsp}; the stack
	// object is first loaded into the scratch register.
	def emit_load_curstack_vsp(vsp: Reg) {
		var offsets = getOffsets();
		asm.movq_r_m(scratch, absPointer(offsets.X86_64Runtime_curStack));
		asm.movq_r_m(G(vsp), scratch.plus(offsets.X86_64Stack_vsp));
	}
	// Loads the interpreter's dispatch table pointer from its absolute location into the GPR {reg}.
	def emit_load_dispatch_table_reg(reg: Reg) {
		var offsets = getOffsets();
		asm.movq_r_m(G(reg), absPointer(offsets.Interpreter_dispatchTable));
	}
	// Emits a 32-bit count-leading-zeros of {s} into {r}, computed as 31 minus the index of the
	// highest set bit; for a zero input the index is replaced by -1, giving 32.
	// The scratch register is overwritten.
	def emit_i32_clz_r_r(r: X86_64Gpr, s: X86_64Gpr) {
		asm.movd_r_i(scratch, -1);
		asm.d.bsr_r_r(r, s);
		asm.d.cmov_r(C.Z, r, scratch);	// zero input: use -1 as the bit index
		asm.movd_r_i(scratch, 31);
		asm.d.sub_r_r(scratch, r);
		asm.movd_r_r(r, scratch); // XXX: can save an instruction here?
	}
	// Emits a 32-bit count-trailing-zeros of {s} into {r}: the index of the lowest set bit, or
	// 32 for a zero input. The scratch register is overwritten.
	def emit_i32_ctz_r_r(r: X86_64Gpr, s: X86_64Gpr) {
		asm.d.bsf_r_r(r, s);
		asm.movd_r_i(scratch, 32);
		asm.d.cmov_r(C.Z, r, scratch);	// zero input: result is 32
	}
	// Emits a 64-bit count-leading-zeros of {s} into {r}, computed as 63 minus the index of the
	// highest set bit; for a zero input the index is replaced by -1, giving 64.
	// The scratch register is overwritten.
	def emit_i64_clz_r_r(r: X86_64Gpr, s: X86_64Gpr) {
		asm.movq_r_i(scratch, -1);
		asm.q.bsr_r_r(r, s);
		asm.q.cmov_r(C.Z, r, scratch);	// zero input: use -1 as the bit index
		asm.movq_r_i(scratch, 63);
		asm.q.sub_r_r(scratch, r);
		asm.movq_r_r(r, scratch); // XXX: can save an instruction with second output reg
	}
	// Emits a 64-bit count-trailing-zeros of {s} into {r}: the index of the lowest set bit, or
	// 64 for a zero input. The scratch register is overwritten.
	def emit_i64_ctz_r_r(r: X86_64Gpr, s: X86_64Gpr) {
		asm.q.bsf_r_r(r, s);
		asm.movq_r_i(scratch, 64);
		asm.q.cmov_r(C.Z, r, scratch);	// zero input: result is 64
	}
	// Sign-extends the low 32 bits of {r} to 64 bits in place, using a shift-left /
	// arithmetic-shift-right pair.
	def emit_i64_extend_i32_s(r: X86_64Gpr) {
		asm.q.shl_r_i(r, 32);
		asm.q.sar_r_i(r, 32);
	}
	// Emits a 32-bit move of {r} onto itself.
	// NOTE(review): presumably relies on 32-bit moves clearing the upper half of the 64-bit
	// register to perform the zero extension — confirm against the assembler's encoding.
	def emit_i64_extend_i32_u(r: X86_64Gpr) {
		asm.movd_r_r(r, r);
	}
	// SSE assemblers and helpers
	// Masks for simd instructions
	// Each mask is a pair of 64-bit halves; {load_v128_mask} inserts element 0 of the pair as
	// quadword 0 and element 1 as quadword 1 of the destination register.
	def mask_i8x16_splat_0x0f: (u64, u64) = (0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F);
	def mask_i8x16_popcnt: (u64, u64) = (0x0302020102010100, 0x0403030203020201);
	// Sign-bit-only masks, per 32-bit and per 64-bit lane.
	def mask_v128_float_neg_constant: (u64, u64) = (0x8000000080000000, 0x8000000080000000);
	def mask_v128_double_neg_constant: (u64, u64) = (0x8000000000000000, 0x8000000000000000);
	// All-bits-except-sign masks, per 32-bit and per 64-bit lane.
	def mask_v128_float_absolute_constant: (u64, u64) = (0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF);
	def mask_v128_double_absolute_constant: (u64, u64) = (0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF);
	def mask_f64x2_convert_low_i32x4_u_int: (u64, u64) = (0x4330000043300000, 0x4330000043300000);
	def mask_double_2_power_52: (u64, u64) = (0x4330000000000000, 0x4330000000000000);
	def mask_i8x16_splat_0x01: (u64, u64) = (0x0101010101010101, 0x0101010101010101);
	def mask_i16x8_splat_0x0001: (u64, u64) = (0x0001000100010001, 0x0001000100010001);
	def mask_i8x16_swizzle: (u64, u64) = (0x7070707070707070, 0x7070707070707070);
	def mask_int32_max_as_double: (u64, u64) = (0x41dfffffffc00000, 0x41dfffffffc00000);
	def mask_uint32_max_as_double: (u64, u64) = (0x41efffffffe00000, 0x41efffffffe00000);
	// Loads the 128-bit constant {mask} into the XMM register {dst}, using the GPR {tmp}
	// (which is overwritten) to transfer each 64-bit half.
	def load_v128_mask(dst: X86_64Xmmr, mask: (u64, u64), tmp: X86_64Gpr) {
		// Insert the first half as quadword 0 of the XMM register...
		asm.movq_r_l(tmp, long.view(mask.0));
		asm.pinsrq_s_r_i(dst, tmp, 0);
		// ...and the second half as quadword 1.
		asm.movq_r_l(tmp, long.view(mask.1));
		asm.pinsrq_s_r_i(dst, tmp, 1);
	}
	// Emits {orps}: {dst} |= {src}.
	def emit_v128_orps(dst: X86_64Xmmr, src: X86_64Xmmr) {
		asm.orps_s_s(dst, src);
	}
	// Emits {xorps}: {dst} ^= {src}.
	def emit_v128_xorps(dst: X86_64Xmmr, src: X86_64Xmmr) {
		asm.xorps_s_s(dst, src);
	}
	// Emits {andps}: {dst} &= {src}.
	def emit_v128_andps(dst: X86_64Xmmr, src: X86_64Xmmr) {
		asm.andps_s_s(dst, src);
	}
	// Negates the integer lanes of {dst} by computing 0 - {dst}: {scratch} is zeroed, the
	// lane-wise subtract {f} is applied as f(scratch, dst), and the result is moved back to {dst}.
	def emit_v128_neg<T>(dst: X86_64Xmmr, scratch: X86_64Xmmr, f: (X86_64Xmmr, X86_64Xmmr) -> T) {
		asm.pxor_s_s(scratch, scratch);
		f(scratch, dst);
		// todo: optimize out this move
		asm.movaps_s_s(dst, scratch);
	}
	// Negates {dst} lane-wise using the byte subtract {psubb}; {scratch} is overwritten.
	def emit_i8x16_neg(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_neg(dst, scratch, asm.psubb_s_s);
	}
	// Negates each 16-bit lane of {dst} (via psubw from zero); clobbers {scratch}.
	def emit_i16x8_neg(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_neg(dst, scratch, asm.psubw_s_s);
	}
	// Negates each 32-bit lane of {dst} (via psubd from zero); clobbers {scratch}.
	def emit_i32x4_neg(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_neg(dst, scratch, asm.psubd_s_s);
	}
	// Negates each 64-bit lane of {dst} (via psubq from zero); clobbers {scratch}.
	def emit_i64x2_neg(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_neg(dst, scratch, asm.psubq_s_s);
  }
	// Inverts all 128 bits of {dst}. {scratch} is clobbered (left holding all ones).
	def emit_v128_not(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		// comparing a register with itself yields all ones
		asm.pcmpeqd_s_s(scratch, scratch);
		// dst ^ ~0 == ~dst
		asm.xorps_s_s(dst, scratch);
	}
	// Bitwise select: {dst} = ({dst} & {mask}) | ({src} & ~{mask}).
	// All of {src}, {mask} and {scratch} are overwritten in the process.
	def emit_v128_bitselect(dst: X86_64Xmmr, src: X86_64Xmmr, mask: X86_64Xmmr, scratch: X86_64Xmmr) {
		// AND(v1, c)
		asm.andpd_s_s(dst, mask);
		// NOT(c) -- computed in place, so {mask} no longer holds the original condition
		emit_v128_not(mask, scratch);
		// AND(v2, NOT(c))
		asm.andpd_s_s(src, mask);
		// OR
		asm.orps_s_s(dst, src);
	}
	// Negates each f32 lane of {dst} by flipping its sign bit.
	// {mask} receives the sign-bit constant and {tmp} is used to load it; both are clobbered.
	def emit_v128_negps(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_v128_float_neg_constant, tmp);
		emit_v128_xorps(dst, mask);
	}
	// Negates each f64 lane of {dst} by flipping its sign bit.
	// {mask} receives the sign-bit constant and {tmp} is used to load it; both are clobbered.
	def emit_v128_negpd(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_v128_double_neg_constant, tmp);
		// the XOR is purely bitwise, so the single-precision form is used for doubles too
		emit_v128_xorps(dst, mask);
	}
	// Takes the absolute value of each f32 lane of {dst} by clearing its sign bit.
	// {mask} receives the constant and {tmp} is used to load it; both are clobbered.
	def emit_v128_absps(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_v128_float_absolute_constant, tmp);
		emit_v128_andps(dst, mask);
	}
	// Takes the absolute value of each f64 lane of {dst} by clearing its sign bit.
	// {mask} receives the constant and {tmp} is used to load it; both are clobbered.
	def emit_v128_abspd(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_v128_double_absolute_constant, tmp);
		// the AND is purely bitwise, so the single-precision form is used for doubles too
		emit_v128_andps(dst, mask);
	}
	// Clears all 128 bits of {s} by XOR-ing the register with itself.
	def emit_v128_zero(s: X86_64Xmmr) {
		emit_v128_xorps(s, s);
	}
	// Multiplies the two 64-bit lanes of {lhs} and {rhs}, leaving the low 64 bits of each product
	// in {lhs}. Built from 32x32->64 multiplies (pmuludq):
	//   lo(l)*lo(r) + ((hi(l)*lo(r) + hi(r)*lo(l)) << 32).
	// {rhs} is only read; {tmp1} and {tmp2} are clobbered.
	def emit_i64x2_mul(lhs: X86_64Xmmr, rhs: X86_64Xmmr, tmp1: X86_64Xmmr, tmp2: X86_64Xmmr) {
		asm.movaps_s_s(tmp1, lhs);
		asm.movaps_s_s(tmp2, rhs);
		// 1. Multiply high dword of each qword of left with right.
		asm.psrlq_i(tmp1, 32);
		asm.pmuludq_s_s(tmp1, rhs);
		// 2. Multiply high dword of each qword of right with left.
		asm.psrlq_i(tmp2, 32);
		asm.pmuludq_s_s(tmp2, lhs);
		// 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
		asm.paddq_s_s(tmp2, tmp1);
		asm.psllq_i(tmp2, 32);
		// 4. Multiply low dwords (this is the low dword of result).
		asm.pmuludq_s_s(lhs, rhs);
		// 5. Add 3 and 4.
		asm.paddq_s_s(lhs, tmp2);
	}
	// Lane-wise "not equal": {dst} becomes the inverse of the equality mask produced by the
	// compare-equal emitter {f}. Note that {src} is overwritten with all ones.
	def emit_v128_ne<T>(dst: X86_64Xmmr, src: X86_64Xmmr, f: (X86_64Xmmr, X86_64Xmmr) -> T) {
		// dst = (dst == src)
		f(dst, src);
		// src == src in every lane, so src becomes all ones
		f(src, src);
		// invert the equality mask
		asm.pxor_s_s(dst, src);
	} 
	// {dst} = per-byte mask of ({dst} != {src}); {src} is overwritten with all ones.
	def emit_i8x16_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ne(dst, src, asm.pcmpeqb_s_s);
	}
	// {dst} = per-16-bit-lane mask of ({dst} != {src}); {src} is overwritten with all ones.
	def emit_i16x8_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ne(dst, src, asm.pcmpeqw_s_s);
	}
	// {dst} = per-32-bit-lane mask of ({dst} != {src}); {src} is overwritten with all ones.
	def emit_i32x4_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ne(dst, src, asm.pcmpeqd_s_s);
	}
	// {dst} = per-64-bit-lane mask of ({dst} != {src}); {src} is overwritten with all ones.
	def emit_i64x2_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ne(dst, src, asm.pcmpeqq_s_s);
	}
	// Lane-wise unsigned "greater than": {dst} = mask of ({dst} > {src}).
	// {pmax} must be the unsigned lane-wise maximum and {pcmp} the compare-equal for the same
	// lane width. Uses max(dst, src) == src  <=>  dst <= src, then inverts the mask.
	// {scratch} is clobbered (left holding all ones); {src} is only read.
	def emit_v128_gt_u<T>(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr,
		pmax: (X86_64Xmmr, X86_64Xmmr) -> T, pcmp: (X86_64Xmmr, X86_64Xmmr) -> T) {
		// dst = max(dst, src)
		pmax(dst, src);
		// dst = (max == src), i.e. the mask of lanes where dst <= src
		pcmp(dst, src);
		// scratch = all ones
		pcmp(scratch, scratch);
		// invert: lanes where dst > src
		asm.xorps_s_s(dst, scratch);
	}
	// {dst} = per-byte mask of unsigned ({dst} > {src}); clobbers {scratch}.
	def emit_i8x16_gt_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_gt_u(dst, src, scratch, asm.pmaxub_s_s, asm.pcmpeqb_s_s);
	}
	// {dst} = per-16-bit-lane mask of unsigned ({dst} > {src}); clobbers {scratch}.
	def emit_i16x8_gt_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_gt_u(dst, src, scratch, asm.pmaxuw_s_s, asm.pcmpeqw_s_s);
	}
	// {dst} = per-32-bit-lane mask of unsigned ({dst} > {src}); clobbers {scratch}.
	def emit_i32x4_gt_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		emit_v128_gt_u(dst, src, scratch, asm.pmaxud_s_s, asm.pcmpeqd_s_s);
	}
	// Lane-wise "greater than or equal": {dst} = mask of ({dst} >= {src}).
	// {pmin} is the lane-wise minimum (its signedness decides the signedness of the comparison)
	// and {pcmp} the compare-equal for the same lane width.
	// Uses min(dst, src) == src  <=>  src <= dst. {src} is only read.
	def emit_v128_ge<T>(dst: X86_64Xmmr, src: X86_64Xmmr,
		pmin: (X86_64Xmmr, X86_64Xmmr) -> T, pcmp: (X86_64Xmmr, X86_64Xmmr) -> T) {
		pmin(dst, src);
		pcmp(dst, src);
	}
	// {dst} = per-byte mask of signed ({dst} >= {src}).
	def emit_i8x16_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ge(dst, src, asm.pminsb_s_s, asm.pcmpeqb_s_s);
	}
	// {dst} = per-16-bit-lane mask of signed ({dst} >= {src}).
	def emit_i16x8_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ge(dst, src, asm.pminsw_s_s, asm.pcmpeqw_s_s);
	}
	// {dst} = per-32-bit-lane mask of signed ({dst} >= {src}).
	def emit_i32x4_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ge(dst, src, asm.pminsd_s_s, asm.pcmpeqd_s_s);
	}
	// {dst} = per-byte mask of unsigned ({dst} >= {src}).
	def emit_i8x16_ge_u(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ge(dst, src, asm.pminub_s_s, asm.pcmpeqb_s_s);
	}
	// {dst} = per-16-bit-lane mask of unsigned ({dst} >= {src}).
	def emit_i16x8_ge_u(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ge(dst, src, asm.pminuw_s_s, asm.pcmpeqw_s_s);
	}
	// {dst} = per-32-bit-lane mask of unsigned ({dst} >= {src}).
	def emit_i32x4_ge_u(dst: X86_64Xmmr, src: X86_64Xmmr) {
		emit_v128_ge(dst, src, asm.pminud_s_s, asm.pcmpeqd_s_s);
	}
	// {dst} = per-64-bit-lane mask of signed ({dst} >= {src}), computed as NOT({src} > {dst}).
	// The result is built in {src}, so {src} is overwritten; {scratch} is left as all ones.
	def emit_i64x2_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		// src = (src > dst)
		asm.pcmpgtq_s_s(src, dst);
		// scratch = all ones
		asm.pcmpeqd_s_s(scratch, scratch);
		// src = !(src > dst)
		emit_v128_xorps(src, scratch);
		asm.movaps_s_s(dst, src);
	}
	// Shifts each byte lane of {dst} left by ({shift} mod 8) bits.
	// There is no byte-granular shift, so bits that a 16-bit shift would carry into the
	// neighbouring byte are masked off first and then a word shift (psllw) is applied.
	// {shift} is only read; {tmp1}, {tmp2} and {tmp3} are clobbered.
	def emit_i8x16_shl(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
		// Take shift value modulo 8.
		asm.movd_r_r(tmp1, shift);
		asm.and_r_i(tmp1, 0x7);
		asm.add_r_i(tmp1, 0x8);
		// Create a mask to unset high bits.
		// (all-ones words shifted right by 8 + n, then packed down to bytes)
		asm.movd_s_r(tmp3, tmp1);
		asm.pcmpeqd_s_s(tmp2, tmp2);
		asm.psrlw_s_s(tmp2, tmp3);
		asm.packuswb_s_s(tmp2, tmp2);
		// Mask off the unwanted bits before word-shifting.
		emit_v128_andps(dst, tmp2);
		// undo the +8 so that tmp1 holds the plain shift count again
		asm.add_r_i(tmp1, -0x8);
		asm.movd_s_r(tmp3, tmp1);
		asm.psllw_s_s(dst, tmp3);
	}
	// Arithmetically shifts each byte lane of {dst} right by ({shift} mod 8) bits.
	// Every byte is placed in the upper half of a 16-bit word, so shifting the word by (8 + n)
	// both discards the lower half and performs the byte shift.
	// {shift} is only read; {tmp1}, {tmp2} and {tmp3} are clobbered.
	def emit_i8x16_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
		// Unpack the bytes into words, do arithmetic shifts, and repack.
		// (the low byte of each word in tmp2 is whatever tmp2 held before; it is shifted out below)
		asm.punpckhbw_s_s(tmp2, dst);
		asm.punpcklbw_s_s(dst, dst);
		// Prepare shift value
		asm.movd_r_r(tmp1, shift);
		// Take shift value modulo 8.
		asm.and_r_i(tmp1, 0x7);
		asm.add_r_i(tmp1, 0x8);
		asm.movd_s_r(tmp3, tmp1);
		asm.psraw_s_s(tmp2, tmp3);
		asm.psraw_s_s(dst, tmp3);
		// narrow the words back to bytes: low eight from dst, high eight from tmp2
		asm.packsswb_s_s(dst, tmp2);
	}
	// Logically shifts each byte lane of {dst} right by ({shift} mod 8) bits.
	// Every byte is placed in the upper half of a 16-bit word, so shifting the word by (8 + n)
	// both discards the lower half and performs the byte shift.
	// {shift} is only read; {tmp1}, {tmp2} and {tmp3} are clobbered.
	def emit_i8x16_shr_u(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
		// Unpack the bytes into words, do logical shifts, and repack.
		// (the low byte of each word in tmp2 is whatever tmp2 held before; it is shifted out below)
		asm.punpckhbw_s_s(tmp2, dst);
		asm.punpcklbw_s_s(dst, dst);
		// Prepare shift value
		asm.movd_r_r(tmp1, shift);
		// Take shift value modulo 8.
		asm.and_r_i(tmp1, 0x7);
		asm.add_r_i(tmp1, 0x8);
		asm.movd_s_r(tmp3, tmp1);
		asm.psrlw_s_s(tmp2, tmp3);
		asm.psrlw_s_s(dst, tmp3);
		// narrow the words back to bytes: low eight from dst, high eight from tmp2
		asm.packuswb_s_s(dst, tmp2);
	}
	// Generic lane shift: reduces the count in {shift} to its low {width} bits, moves it into
	// {xtmp}, and applies the packed-shift emitter {asm_pshfit_s_s} to {dst}.
	// {width} is the number of count bits to keep (the mask is (1 << width) - 1) -- presumably
	// log2 of the lane size in bits; confirm against callers.
	// {shift} is only read; {gtmp} and {xtmp} are clobbered.
	def emit_v128_shift<T>(dst: X86_64Xmmr, shift: X86_64Gpr, width: byte, gtmp: X86_64Gpr, xtmp: X86_64Xmmr,
			asm_pshfit_s_s: (X86_64Xmmr, X86_64Xmmr) -> T) {
		var mask = (1 << width) - 1;
		asm.movq_r_r(gtmp, shift);
		asm.and_r_i(gtmp, mask);
		asm.movq_s_r(xtmp, gtmp);
		asm_pshfit_s_s(dst, xtmp);
	}
	// Takes the absolute value of each 64-bit lane of {dst} using abs(x) = (x ^ m) - m, where
	// m is all ones for negative x and zero otherwise. {scratch} is clobbered (it holds m).
	def emit_i64x2_abs(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		// duplicate the upper 32 bits of each 64-bit lane into both halves of that lane
		asm.movshdup_s_s(scratch, dst);
		// smear the sign bit across all 32 bits, giving the per-lane mask m
		asm.psrad_i(scratch, 31);
		asm.xorps_s_s(dst, scratch);
		asm.psubq_s_s(dst, scratch);
	}
	// Arithmetically shifts each 64-bit lane of {dst} right by ({shift} mod 64) bits.
	// Done with logical shifts only: the value is biased by 2^63 (flipping the sign bit), value
	// and bias are both shifted logically, and the shifted bias is subtracted again.
	// {shift} is only read; {tmp_shift}, {xmm_tmp} and {xmm_shift} are clobbered.
	def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp_shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr) {
		// xmm_tmp = 0x8000000000000000 in both lanes (the bias)
		asm.pcmpeqd_s_s(xmm_tmp, xmm_tmp);
		asm.psllq_i(xmm_tmp, 63);
		// shift modulo 64
		asm.movd_r_r(tmp_shift, shift);
		asm.and_r_i(tmp_shift, 0x3f);
		asm.movd_s_r(xmm_shift, tmp_shift);
		// add the bias; only the top bit changes, so an XOR suffices
		emit_v128_xorps(dst, xmm_tmp);
		// logically shift both the biased value and the bias
		asm.psrlq_s_s(dst, xmm_shift);
		asm.psrlq_s_s(xmm_tmp, xmm_shift);
		// remove the shifted bias
		asm.psubq_s_s(dst, xmm_tmp);
	}
	// Multiplies the low eight 8-bit lanes of {dst} and {src}, each widened to 16 bits, and
	// leaves the eight 16-bit products in {dst}. {is_signed} selects sign-extension
	// (pmovsxbw) versus zero-extension (pmovzxbw). {scratch} receives the widened {src} lanes.
	def emit_i16x8_extmul_low(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr, is_signed: bool) {
		// Both operands are widened the same way, so pick the instruction once.
		var widen = if(is_signed, asm.pmovsxbw_s_s, asm.pmovzxbw_s_s);
		widen(scratch, src);
		widen(dst, dst);
		asm.pmullw_s_s(dst, scratch);
	}
	// Multiplies the high eight signed 8-bit lanes of {dst} and {src}, widened to 16 bits, and
	// leaves the eight 16-bit products in {dst}. {src} is only read; {scratch} is clobbered.
	def emit_i16x8_extmul_high_s(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		asm.movaps_s_s(scratch, src);
		// interleave each high byte with itself, then shift right arithmetically by 8:
		// this sign-extends the byte to a word
		asm.punpckhbw_s_s(dst, dst);
		asm.psraw_i(dst, 8);
		// same widening for the copy of src
		asm.punpckhbw_s_s(scratch, scratch);
		asm.psraw_i(scratch, 8);
		asm.pmullw_s_s(dst, scratch);
	}
	// Multiplies the high eight unsigned 8-bit lanes of {dst} and {src}, widened to 16 bits, and
	// leaves the eight 16-bit products in {dst}. {src} is only read; {scratch} is clobbered.
	def emit_i16x8_extmul_high_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		asm.xorps_s_s(scratch, scratch);
		// interleave dst's high bytes with zero bytes: zero-extends them to words
		asm.punpckhbw_s_s(dst, scratch);
		// interleave zero bytes with src's high bytes: each word is (byte << 8) ...
		asm.punpckhbw_s_s(scratch, src);
		// ... so a logical shift by 8 yields the zero-extended byte
		asm.psrlw_i(scratch, 8);
		asm.pmullw_s_s(dst, scratch);
	}
	// Multiplies four 16-bit lanes of {dst} and {src} into four 32-bit products left in {dst}.
	// {low} selects the low (true) or high (false) four lanes; {is_signed} selects signed or
	// unsigned multiplication. {src} is only read; {scratch} is clobbered.
	def emit_i32x4_extmul(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr, low: bool, is_signed: bool) {
		// Instruction producing the upper 16 bits of each 16x16 product.
		var mulhi = if(is_signed, asm.pmulhw_s_s, asm.pmulhuw_s_s);
		// Instruction interleaving the low/high product halves of the chosen four lanes.
		var interleave = if(low, asm.punpcklwd_s_s, asm.punpckhwd_s_s);

		asm.movaps_s_s(scratch, dst);
		// dst = low 16 bits of every product, scratch = high 16 bits of every product
		asm.pmullw_s_s(dst, src);
		mulhi(scratch, src);
		// stitch the halves together into 32-bit results
		interleave(dst, scratch);
	}
	// Multiplies two 32-bit lanes of {dst} and {src} into two 64-bit products left in {dst}.
	// {low} selects lanes 0,1 (true) or lanes 2,3 (false); {is_signed} selects signed (pmuldq)
	// or unsigned (pmuludq) multiplication. {src} is only read; {scratch} is clobbered.
	def emit_i64x2_extmul(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr, low: bool, is_signed: bool) {
		// pshufd control that places the two selected lanes at the even dword positions,
		// which are the ones the widening multiply reads.
		var lanes: u8 = 0xFA;
		if (low) lanes = 0x50;
		asm.pshufd_s_s_i(scratch, src, lanes);
		asm.pshufd_s_s_i(dst, dst, lanes);
		var mul = if(is_signed, asm.pmuldq_s_s, asm.pmuludq_s_s);
		mul(dst, scratch);
	}
	// Sets {dst} to 1 if any bit of {src} is set, and to 0 otherwise. {src} is only read.
	def emit_v128_anytrue(dst: X86_64Gpr, src: X86_64Xmmr) {
		// clear the whole register first (before the flags are produced); set_r then
		// stores the condition result
		asm.q.xor_r_r(dst, dst);
		// ZF = ((src & src) == 0)
		asm.ptest_s_s(src, src);
		asm.set_r(C.NZ, dst);
	}
	// Sets {dst} to 1 if every lane of {src} is non-zero, and to 0 otherwise.
	// {pcmp} is the compare-equal emitter that fixes the lane width.
	// {src} is only read; {tmp} is clobbered.
	private def emit_v128_alltrue<T>(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr, pcmp: (X86_64Xmmr, X86_64Xmmr) -> T) {
		asm.q.xor_r_r(dst, dst);
		// tmp = mask of the lanes of src that are equal to zero
		asm.pxor_s_s(tmp, tmp);
		pcmp(tmp, src);
		// ZF is set iff no lane was zero
		asm.ptest_s_s(tmp, tmp);
		asm.set_r(C.Z, dst);
	}
	// {dst} = 1 if all sixteen 8-bit lanes of {src} are non-zero, else 0; clobbers {tmp}.
	def emit_i8x16_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) { emit_v128_alltrue(dst, src, tmp, asm.pcmpeqb_s_s); }
	// {dst} = 1 if all eight 16-bit lanes of {src} are non-zero, else 0; clobbers {tmp}.
	def emit_i16x8_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) { emit_v128_alltrue(dst, src, tmp, asm.pcmpeqw_s_s); }
	// {dst} = 1 if all four 32-bit lanes of {src} are non-zero, else 0; clobbers {tmp}.
	def emit_i32x4_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) { emit_v128_alltrue(dst, src, tmp, asm.pcmpeqd_s_s); }
	// {dst} = 1 if both 64-bit lanes of {src} are non-zero, else 0; clobbers {tmp}.
	def emit_i64x2_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) { emit_v128_alltrue(dst, src, tmp, asm.pcmpeqq_s_s); }

	// Collects the sign bit of each of the eight 16-bit lanes of {src} into the low 8 bits
	// of {dst}. Note that {src} is overwritten by the packing step.
	def emit_i16x8_bitmask(dst: X86_64Gpr, src: X86_64Xmmr) {
		// narrow words to bytes with signed saturation (which keeps the sign);
		// both halves of src now hold the same eight bytes
		asm.packsswb_s_s(src, src);
		// gather the 16 byte sign bits
		asm.pmovmskb_r_s(dst, src);
		// the two halves are duplicates; keep a single 8-bit copy
		asm.shr_r_i(dst, 8);
	}
	// Replaces each byte of {dst} with the number of bits set in it.
	// Each byte is split into its two nibbles; every nibble is looked up in the 16-entry table
	// {mask_i8x16_popcnt} with pshufb and the two counts are added.
	// {tmp1}, {xtmp1}, {xtmp2} and {mask} are all clobbered.
	def emit_i8x16_popcnt(dst: X86_64Xmmr, tmp1: X86_64Gpr, xtmp1: X86_64Xmmr, xtmp2: X86_64Xmmr, mask: X86_64Xmmr) {
		// load masks
		load_v128_mask(xtmp1, mask_i8x16_splat_0x0f, tmp1);
		load_v128_mask(mask, mask_i8x16_popcnt, tmp1);
		asm.movaps_s_s(xtmp2, xtmp1);
		// prepare temp variables
		// xtmp1 = low nibble of every byte
		asm.andps_s_s(xtmp1, dst);
		// xtmp2 = high nibble of every byte, moved down into the low nibble position
		asm.andnps_s_s(xtmp2, dst);
		asm.psrlw_i(xtmp2, 4);
		// copy popcnt_mask to dst
		asm.movaps_s_s(dst, mask);
		// table lookup for the low nibbles
		asm.pshufb_s_s(dst, xtmp1);
		// table lookup for the high nibbles
		asm.movaps_s_s(xtmp1, mask);
		asm.pshufb_s_s(xtmp1, xtmp2);
		asm.paddb_s_s(dst, xtmp1);
	}
	// Lane-wise f32 minimum of {dst} and {src}, left in {dst}, with explicit handling of
	// -0 and NaN lanes (see the comments below). {src} is only read; {scratch} is clobbered.
	def emit_f32x4_min(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		// The minps instruction doesn't propagate NaNs and +0's in its first
		// operand. Perform minps in both orders, merge the results, and adjust.
		asm.movaps_s_s(scratch, src);
		asm.minps_s_s(scratch, dst);
		asm.minps_s_s(dst, src);
		// Propagate -0's and NaNs, which may be non-canonical.
		asm.orps_s_s(scratch, dst);
		// Canonicalize NaNs by quieting and clearing the payload.
		// (dst becomes an all-ones mask in NaN lanes; shifted right by 10 it covers the
		// low 22 bits, which the final and-not clears from the merged result)
		asm.cmpunordps_s_s(dst, scratch);
		asm.orps_s_s(scratch, dst);
		asm.psrld_i(dst, 10);
		asm.andnps_s_s(dst, scratch);
	}
	// Lane-wise f32 maximum of {dst} and {src}, left in {dst}, with explicit handling of
	// signed zero and NaN lanes (see the comments below).
	// {src} is only read; {scratch} is clobbered.
	def emit_f32x4_max(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		// The maxps instruction doesn't propagate NaNs and +0's in its first
		// operand. Perform maxps in both orders, merge the results, and adjust.
		asm.movaps_s_s(scratch, src);
		asm.maxps_s_s(scratch, dst);
		asm.maxps_s_s(dst, src);
		// Find discrepancies.
		asm.xorps_s_s(dst, scratch);
		// Propagate -0's and NaNs, which may be non-canonical.
		asm.orps_s_s(scratch, dst);
		// Propagate sign discrepancy and (subtle) quiet NaNs.
		asm.subps_s_s(scratch, dst);
		// Canonicalize NaNs by quieting and clearing the payload.
		// (dst becomes an all-ones mask in NaN lanes; shifted right by 10 it covers the
		// low 22 bits, which the final and-not clears from the result)
		asm.cmpunordps_s_s(dst, scratch);
		asm.psrld_i(dst, 10);
		asm.andnps_s_s(dst, scratch);
	}
	// Converts each unsigned 32-bit integer lane of {dst} to f32, in place.
	// Only a signed conversion (cvtdq2ps) is used, so each lane is split into its low and high
	// 16 bits, the parts are converted separately and then summed. {scratch} is clobbered.
	def emit_f32x4_convert_i32x4_u(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		// scratch = 0
		asm.pxor_s_s(scratch, scratch);
		// scratch = low 16 bits of every lane
		asm.pblendw_s_s_i(scratch, dst, 0x55);
		// dst = high 16 bits of every lane (still in place)
		asm.psubd_s_s(dst, scratch);
		// convert the low part
		asm.cvtdq2ps_s_s(scratch, scratch);
		// halve the high part so that it fits the signed range
		asm.psrld_i(dst, 1);
		// convert the halved high part
		asm.cvtdq2ps_s_s(dst, dst);
		// double it again to undo the halving
		asm.addps_s_s(dst, dst);
		// add high and low parts
		asm.addps_s_s(dst, scratch);
	}
	// Lane-wise f64 minimum of {dst} and {src}, left in {dst}; the double-precision
	// counterpart of {emit_f32x4_min}. {src} is only read; {scratch} is clobbered.
	def emit_f64x2_min(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		// compute the minimum in both operand orders
		asm.movaps_s_s(scratch, src);
		asm.minpd_s_s(scratch, dst);
		asm.minpd_s_s(dst, src);
		// merge both results
		asm.orpd_s_s(scratch, dst);
		// dst = all-ones mask in NaN lanes
		asm.cmpunordpd_s_s(dst, scratch);
		asm.orpd_s_s(scratch, dst);
		// shifted right by 13 the mask covers the low 51 bits, which the and-not clears
		asm.psrlq_i(dst, 13);
		asm.andnpd_s_s(dst, scratch);
	}
	// Lane-wise f64 maximum of {dst} and {src}, left in {dst}; the double-precision
	// counterpart of {emit_f32x4_max}. {src} is only read; {scratch} is clobbered.
	def emit_f64x2_max(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
		// compute the maximum in both operand orders
		asm.movaps_s_s(scratch, src);
		asm.maxpd_s_s(scratch, dst);
		asm.maxpd_s_s(dst, src);
		// dst = bits in which the two results differ
		asm.xorpd_s_s(dst, scratch);
		// merge the difference into scratch, then subtract it
		asm.orpd_s_s(scratch, dst);
		asm.subpd_s_s(scratch, dst);
		// dst = all-ones mask in NaN lanes
		asm.cmpunordpd_s_s(dst, scratch);
		// shifted right by 13 the mask covers the low 51 bits, which the and-not clears
		asm.psrlq_i(dst, 13);
		asm.andnpd_s_s(dst, scratch);
	}
	// Converts the two low unsigned 32-bit integer lanes of {dst} to two f64 lanes, in place.
	// Each integer n is paired with the upper word 0x43300000, forming the double (2^52 + n);
	// subtracting 2^52 then leaves n. {tmp} and {mask} are clobbered.
	def emit_f64x2_convert_low_i32x4_u(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_f64x2_convert_low_i32x4_u_int, tmp);
		// interleave: each 64-bit lane becomes (0x43300000 << 32) | n
		asm.unpcklps_s_s(dst, mask);
		load_v128_mask(mask, mask_double_2_power_52, tmp);
		asm.subpd_s_s(dst, mask);
	}
	// Sign-extends the high eight 8-bit lanes of {dst} to eight 16-bit lanes, in place.
	def emit_i16x8_s_convert_i8x16_high(dst: X86_64Xmmr) {
		// bring the high 64 bits down to the low half, then widen the low eight bytes
		asm.movhlps_s_s(dst, dst);
		asm.pmovsxbw_s_s(dst, dst);
	}
	// Zero-extends the high eight 8-bit lanes of {dst} to eight 16-bit lanes, in place.
	// {scratch} is clobbered (zeroed).
	def emit_i16x8_u_convert_i8x16_high(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		asm.xorps_s_s(scratch, scratch);
		// interleaving with zero bytes zero-extends each byte to a word
		asm.punpckhbw_s_s(dst, scratch);
	}
	// Sign-extends the high four 16-bit lanes of {dst} to four 32-bit lanes, in place.
	def emit_i32x4_s_convert_i16x8_high(dst: X86_64Xmmr) {
		// bring the high 64 bits down to the low half, then widen the low four words
		asm.movhlps_s_s(dst, dst);
		asm.pmovsxwd_s_s(dst, dst);
	}
	// Zero-extends the high four 16-bit lanes of {dst} to four 32-bit lanes, in place.
	// {scratch} is clobbered (zeroed).
	def emit_i32x4_u_convert_i16x8_high(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		asm.xorps_s_s(scratch, scratch);
		// interleaving with zero words zero-extends each word to a dword
		asm.punpckhwd_s_s(dst, scratch);
	}
	// Sign-extends the high two 32-bit lanes of {dst} to two 64-bit lanes, in place.
	def emit_i64x2_s_convert_i32x4_high(dst: X86_64Xmmr) {
		// bring the high 64 bits down to the low half, then widen the low two dwords
		asm.movhlps_s_s(dst, dst);
		asm.pmovsxdq_s_s(dst, dst);
	}
	// Zero-extends the high two 32-bit lanes of {dst} to two 64-bit lanes, in place.
	// {scratch} is clobbered (zeroed).
	def emit_i64x2_u_convert_i32x4_high(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		asm.xorps_s_s(scratch, scratch);
		// interleaving with zero dwords zero-extends each dword to a qword
		asm.punpckhdq_s_s(dst, scratch);
	}
	// Converts the two f64 lanes of {dst} to signed 32-bit integers with truncation, clamping
	// at the upper bound 2^31 - 1 and mapping NaN to 0. The results are produced by cvttpd2dq,
	// which writes the two low 32-bit lanes. {tmp}, {scratch} and {mask} are clobbered.
	def emit_i32x4_trunc_sat_f64x2_s_zero(dst: X86_64Xmmr, tmp: X86_64Gpr, scratch: X86_64Xmmr, mask: X86_64Xmmr) {
		// scratch = all ones in lanes that are not NaN (a NaN never equals itself)
		asm.movaps_s_s(scratch, dst);
		asm.cmpeqpd_s_s(scratch, dst);
		load_v128_mask(mask, mask_int32_max_as_double, tmp);
		// scratch = 2147483647.0 in non-NaN lanes and 0.0 in NaN lanes
		asm.andps_s_s(scratch, mask);
		// clamp from above; NaN lanes take the second operand, i.e. 0.0
		asm.minpd_s_s(dst, scratch);
		asm.cvttpd2dq_s_s(dst, dst);
	}
	// Converts the two f64 lanes of {dst} to unsigned 32-bit integers with truncation, clamping
	// to [0, 2^32 - 1]; the two high 32-bit result lanes are zero.
	// {tmp}, {scratch} and {mask} are clobbered.
	def emit_i32x4_trunc_sat_f64x2_u_zero(dst: X86_64Xmmr, tmp: X86_64Gpr, scratch: X86_64Xmmr, mask: X86_64Xmmr) {
		// clamp from below at 0.0 (scratch stays zero until the final shuffle)
		asm.xorps_s_s(scratch, scratch);
		asm.maxpd_s_s(dst, scratch);
		// clamp from above at 4294967295.0
		load_v128_mask(mask, mask_uint32_max_as_double, tmp);
		asm.minpd_s_s(dst, mask);
		// drop the fractional part
		asm.roundpd_s_s(dst, dst, X86_64Rounding.TO_ZERO);
		// adding 2^52 leaves the integer value in the low 32 bits of each double
		load_v128_mask(mask, mask_double_2_power_52, tmp);
		asm.addpd_s_s(dst, mask);
		// gather those low dwords into lanes 0 and 1; lanes 2 and 3 come from the zero register
		asm.shufps_s_s_i(dst, scratch, 0x88);
	}
	// Adds adjacent pairs of signed 8-bit lanes of {dst}, giving eight 16-bit sums in {dst}.
	// {tmp} and {mask} are clobbered.
	def emit_i16x8_extadd_pairwise_i8x16_s(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_i8x16_splat_0x01, tmp);
		// pmaddubsw treats its first operand as unsigned and its second as signed, so the
		// constant 1s go first and the result has to be produced in {mask}
		asm.pmaddubsw_s_s(mask, dst);
		asm.movaps_s_s(dst, mask);
	}
	// Adds adjacent pairs of unsigned 8-bit lanes of {dst}, giving eight 16-bit sums in {dst}.
	// {tmp} and {mask} are clobbered.
	def emit_i16x8_extadd_pairwise_i8x16_u(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_i8x16_splat_0x01, tmp);
		// pmaddubsw treats its first operand ({dst}) as unsigned; multiplying by 1 and adding
		// adjacent products gives the pairwise sums
		asm.pmaddubsw_s_s(dst, mask);
	}
	// Adds adjacent pairs of signed 16-bit lanes of {dst}, giving four 32-bit sums in {dst}.
	// {tmp} and {mask} are clobbered.
	def emit_i32x4_extadd_pairwise_i16x8_s(dst: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		load_v128_mask(mask, mask_i16x8_splat_0x0001, tmp);
		// multiply every word by 1 and add adjacent products
		asm.pmaddwd_s_s(dst, mask);
	}
	// Converts each f32 lane of {dst} to a signed 32-bit integer with truncation, in place,
	// mapping NaN to 0 and fixing up positive overflow to 0x7FFFFFFF. {tmp} is clobbered.
	def emit_i32x4_trunc_sat_f32x4_s(dst: X86_64Xmmr, tmp: X86_64Xmmr) {
		// NAN -> 0
		asm.movaps_s_s(tmp, dst);
		asm.cmpeqps_s_s(tmp, tmp);
		asm.pand_s_s(dst, tmp);
		// set top bit if >= 0 (but not -0.0!)
		asm.pxor_s_s(tmp, dst);
		// convert
		asm.cvttps2dq_s_s(dst, dst);
		// set top bit if >=0 is now < 0
		asm.pand_s_s(tmp, dst);
		asm.psrad_i(tmp, 31);
		// set positive overflow lanes to 0x7FFFFFFF
		asm.pxor_s_s(dst, tmp);
	}
	// Converts each f32 lane of {dst} to an unsigned 32-bit integer with truncation, in place,
	// mapping NaN and negative values to 0 and saturating on overflow.
	// Only the signed conversion (cvttps2dq) is used: the part of the value above the signed
	// maximum is converted separately and added back. {tmp1} and {tmp2} are clobbered.
	def emit_i32x4_trunc_sat_f32x4_u(dst: X86_64Xmmr, tmp1: X86_64Xmmr, tmp2: X86_64Xmmr) {
		// NAN -> 0, negative -> 0
		asm.pxor_s_s(tmp2, tmp2);
		asm.maxps_s_s(dst, tmp2);
		// tmp2: float representation of max_signed
		asm.pcmpeqd_s_s(tmp2, tmp2);
		asm.psrld_i(tmp2, 1); // 0x7fffffff
		asm.cvtdq2ps_s_s(tmp2, tmp2); // 0x4f000000
		// tmp: convert (src-max_signed).
		// positive overflow lanes -> 0x7FFFFFFF
		// negative lanes -> 0
		asm.movaps_s_s(tmp1, dst);
		asm.subps_s_s(tmp1, tmp2);
		asm.cmpleps_s_s(tmp2, tmp1);
		asm.cvttps2dq_s_s(tmp1, tmp1);
		asm.pxor_s_s(tmp1, tmp2);
		asm.pxor_s_s(tmp2, tmp2);
		asm.pmaxsd_s_s(tmp1, tmp2);
		// convert. Overflow lanes above max_signed will be 0x80000000
		asm.cvttps2dq_s_s(dst, dst);
		// add (src-max_signed) for overflow lanes.
		asm.paddd_s_s(dst, tmp1);
	}
	// Adds adjacent pairs of unsigned 16-bit lanes of {dst}, giving four 32-bit sums in {dst}.
	// {scratch} is clobbered.
	def emit_i32x4_extadd_pairwise_i16x8_u(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
		// scratch = upper word of every dword, moved down (its upper word is now zero)
		asm.movaps_s_s(scratch, dst);
		asm.psrld_i(scratch, 16);
		// take the (zero) odd words from scratch: dst = lower word of every dword, zero-extended
		asm.pblendw_s_s_i(dst, scratch, 0xAA);
		// lower word + upper word
		asm.paddd_s_s(dst, scratch);
	}
	// Broadcasts the low byte of {src} to all sixteen 8-bit lanes of {dst}.
	// {scratch} is clobbered (zeroed).
	def emit_i8x16_splat(dst: X86_64Xmmr, src: X86_64Gpr, scratch: X86_64Xmmr) {
		asm.movd_s_r(dst, src);
		// an all-zero shuffle control selects byte 0 for every destination byte
		asm.xorpd_s_s(scratch, scratch);
		asm.pshufb_s_s(dst, scratch);
	}
	// Broadcasts the low 16 bits of {src} to all eight 16-bit lanes of {dst}.
	def emit_i16x8_splat(dst: X86_64Xmmr, src: X86_64Gpr) {
		asm.movd_s_r(dst, src);
		// replicate word 0 across the low four words ...
		asm.pshuflw_s_s_i(dst, dst, 0x0);
		// ... then copy the low 64 bits into the high 64 bits
		asm.punpcklqdq_s_s(dst, dst);
	}
	// Broadcasts the low 32 bits of {src} to all four 32-bit lanes of {dst}.
	def emit_i32x4_splat(dst: X86_64Xmmr, src: X86_64Gpr) {
		asm.movd_s_r(dst, src);
		// shuffle control 0 selects dword 0 for every lane
		asm.pshufd_s_s_i(dst, dst, 0x0);
	}
	// Broadcasts the 64-bit value in {src} to both 64-bit lanes of {dst}.
	def emit_i64x2_splat(dst: X86_64Xmmr, src: X86_64Gpr) {
		asm.movq_s_r(dst, src);
		// duplicate the low quadword into the high quadword
		asm.movddup_s_s(dst, dst);
	}
	// Broadcasts the 32-bit value held in {src} to all four 32-bit lanes of {dst}.
	// {scratch} is kept in the signature for existing callers but is no longer written.
	def emit_f32x4_splat(dst: X86_64Xmmr, src: X86_64Gpr, scratch: X86_64Xmmr) {
		// SHUFPS takes its two low result lanes from the first operand and its two high
		// result lanes from the second. Shuffling {dst} against a separate register would
		// therefore leave stale {dst} contents in lanes 0-1, so the value is moved into
		// {dst} itself and {dst} is shuffled with itself (same shape as emit_i32x4_splat).
		asm.movd_s_r(dst, src);
		asm.shufps_s_s_i(dst, dst, 0x0);
	}
	// Broadcasts the 64-bit value held in {src} to both 64-bit lanes of {dst}.
	// {scratch} is clobbered (it stages the value).
	def emit_f64x2_splat(dst: X86_64Xmmr, src: X86_64Gpr, scratch: X86_64Xmmr) {
		asm.movq_s_r(scratch, src);
		// duplicate scratch's low quadword into both halves of dst
		asm.movddup_s_s(dst, scratch);
	}
	// Byte swizzle: every byte of {dst} is replaced by the byte of {dst} selected by the
	// corresponding index byte in {src}, or by 0 when that index is out of range (> 15).
	// {src} is only read; {tmp} and {mask} are clobbered.
	def emit_i8x16_swizzle(dst: X86_64Xmmr, src: X86_64Xmmr, tmp: X86_64Gpr, mask: X86_64Xmmr) {
		// Out-of-range indices should return 0, add 112 so that any value > 15
		// saturates to 128 (top bit set), so pshufb will zero that lane.
		load_v128_mask(mask, mask_i8x16_swizzle, tmp);
		asm.paddusb_s_s(mask, src);
		asm.pshufb_s_s(dst, mask);
	}
	// Q15 rounding multiply of the 16-bit lanes of {lhs} and {rhs}, left in {lhs}.
	// pmulhrsw produces 0x8000 only in the overflow case (0x8000 * 0x8000); those lanes are
	// flipped to 0x7FFF to saturate. {rhs} is only read; {scratch} is clobbered.
	def emit_i16x8_q15mulrsat_s(lhs: X86_64Xmmr, rhs: X86_64Xmmr, scratch: X86_64Xmmr) {
		// scratch = 0x8000 in every 16-bit lane
		asm.pcmpeqd_s_s(scratch, scratch);
		asm.psllw_i(scratch, 15);
		asm.pmulhrsw_s_s(lhs, rhs);
		// scratch = all ones in lanes where the product is 0x8000
		asm.pcmpeqw_s_s(scratch, lhs);
		// 0x8000 ^ 0xFFFF = 0x7FFF; all other lanes are unchanged
		asm.pxor_s_s(lhs, scratch);
	}
	// Emits a software interrupt with vector 3 (the x86 breakpoint interrupt).
	def emit_debugger_breakpoint() {
		asm.intK(3);
	}
	// Returns the underlying assembler label of {m}, which is cast to {X86_64MasmLabel}.
	def getLabel(m: MasmLabel) -> X86_64Label {
		return X86_64MasmLabel.!(m).label;
	}
	// Builds an absolute address operand (no base, no index register) whose displacement is
	// the distance of {ptr} from {Pointer.NULL}. The distance goes through a u32 cast before
	// being reinterpreted as an int, so {ptr} is expected to fit in 32 bits.
	def absPointer(ptr: Pointer) -> X86_64Addr {
		return X86_64Addr.new(null, null, 1, int.view(u32.!(ptr - Pointer.NULL)));
	}
	// Returns the {V3Offsets} instance of this assembler, creating and caching it on first use.
	def getOffsets() -> V3Offsets {
		if (offsets != null) return offsets;
		offsets = V3Offsets.new();
		return offsets;
	}
}

// XXX: Simplify relative loads for jump table by improving X86_64Assembler
// Marker values handed to the {X86_64AddrPatcher} constructor by {X86_64MasmJumpTablePatcher},
// as its absolute and relative markers respectively.
def ABS_MARKER = 0x77665544;
def REL_MARKER = 0x99887766;
// An address patcher that simply remembers where the most recent rel32 reference was recorded.
class X86_64MasmJumpTablePatcher extends X86_64AddrPatcher {
	var pos: int;		// {pos} argument of the last {recordRel32} call
	var delta: int;		// {delta} argument of the last {recordRel32} call
	new() super(ABS_MARKER, REL_MARKER) { }
	// Stores {pos} and {delta}, overwriting any previous record; {addr} is ignored.
	def recordRel32(pos: int, delta: int, addr: X86_64Addr) {
		this.pos = pos;
		this.delta = delta;
	}
}

// A utility that generates constants and picks appropriate instructions for rounding, data movement,
// and conversion in dealing with floating point truncations.
// {isI64}: the integer result is 64 bits wide (otherwise 32 bits).
// {isF64}: the floating point input is a double (otherwise a float).
// {isSigned}: the integer result is signed.
class FloatTrunc(isI64: bool, isF64: bool, isSigned: bool) {
	// Assembler methods for the input precision (scalar double vs. scalar single forms).
	def round_s_s = if(isF64, X86_64Assembler.roundsd_s_s, X86_64Assembler.roundss_s_s);
	def sub_s_s = if(isF64, X86_64Assembler.subsd_s_s, X86_64Assembler.subss_s_s);
	def ucomi_s_s = if(isF64, X86_64Assembler.ucomisd_s_s, X86_64Assembler.ucomiss_s_s);
	def mov_s_r = if(isF64, X86_64Assembler.movq_s_r, X86_64Assembler.movd_s_r);
	def mov_m_s = if(isF64, X86_64Assembler.movsd_m_s, X86_64Assembler.movss_m_s);
	// Bit pattern of the upper bound constant in the input format, chosen by result width and
	// signedness (2^63, 2^64, 2^31 or 2^32, going by the names of the {Floats} constants).
	def maxv: u64 = if(isI64,
				if(isSigned,
					if(isF64, Floats.d_1p63, Floats.f_1p63),
					if(isF64, Floats.d_1p64, Floats.f_1p64)),
				if(isSigned,
					if(isF64, Floats.d_1p31, Floats.f_1p31),
					if(isF64, Floats.d_1p32, Floats.f_1p32)));
	// Bit pattern of the lower bound constant in the input format; -1 for unsigned results.
	def minv: u64 = if(isI64, // XXX: share these constants with V3 interpreter
				if(isSigned,
					if(isF64, u64.view(-9.223372036854778E18d), u32.view(-9.223373e18f)),
					if(isF64, u64.view(-1d), u32.view(-1f))),
				if(isSigned,
					if(isF64, u64.view(-2147483649d), u32.view(-2.1474839E9f)),
					if(isF64, u64.view(-1d), u32.view(-1f))));

	// Bit pattern of -1.0 in the input format.
	def minus1: u64 = if(isF64, Floats.d_minus1, Floats.f_minus1);

	// Largest value of the integer result type.
	def ceilv: u64 = if(isI64,
				if(isSigned, u63.max, u64.max),
				if(isSigned, u31.max, u32.max));
	// Smallest value of the integer result type. The signed minimum of an N-bit result is
	// iN.min (the counterpart of u(N-1).max above); i63.min / i31.min would only be -2^62 / -2^30.
	// With no else-branch, the unsigned case is the default value, 0.
	def floorv: u64 = if(isSigned,
				if(isI64, u64.view(i64.min), u64.view(i32.min)));

	// Type code of the integer result type.
	def tag = if(isI64, BpTypeCode.I64, BpTypeCode.I32).code;

	// Loads the bit pattern {v} into the XMM register {s}, staging it through {scratch}:
	// all 64 bits for a double input, the low 32 bits for a float input.
	def mov_s_i(asm: X86_64Assembler, s: X86_64Xmmr, v: u64, scratch: X86_64Gpr) {
		if (isF64) {
			asm.movq_r_l(scratch, i64.view(v));
			asm.movq_s_r(s, scratch);
		} else {
			asm.movd_r_i(scratch, int.view(v));
			asm.movd_s_r(s, scratch);
		}
	}
	// Loads the integer constant {v} into {r}, using the integer result width.
	def mov_r_i(asm: X86_64Assembler, r: X86_64Gpr, v: u64) {
		if (!isI64) {
			asm.movd_r_i(r, int.view(v));
		} else if (i32.view(v) == i64.view(v)) {
			// representable as a sign-extended 32-bit immediate
			asm.movq_r_i(r, int.view(v));
		} else if (u32.view(v) == u64.view(v)) {
			// representable as a zero-extended 32-bit immediate
			asm.movd_r_i(r, int.view(v));
		} else {
			// anything else (e.g. the signed 64-bit bounds) needs a full 64-bit immediate
			asm.movq_r_l(r, i64.view(v));
		}
	}
	// Assembler method converting the scalar input to an integer register.
	def cvt2si_r_s = if(isF64, X86_64Assembler.cvtsd2si_r_s, X86_64Assembler.cvtss2si_r_s);
}
// One shared {FloatTrunc} configuration per (integer result, float input, signedness)
// combination; the constructor arguments are (isI64, isF64, isSigned).
def TRUNC_i32_f32_s = FloatTrunc.new(false, false, true);
def TRUNC_i32_f32_u = FloatTrunc.new(false, false, false);
def TRUNC_i32_f64_s = FloatTrunc.new(false, true, true);
def TRUNC_i32_f64_u = FloatTrunc.new(false, true, false);
def TRUNC_i64_f32_s = FloatTrunc.new(true, false, true);
def TRUNC_i64_f32_u = FloatTrunc.new(true, false, false);
def TRUNC_i64_f64_s = FloatTrunc.new(true, true, true);
def TRUNC_i64_f64_u = FloatTrunc.new(true, true, false);