Use relative call instructions between wasm functions #3254

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -19,6 +19,36 @@ pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 {
    }
}

/// Generates the four instructions necessary for a small "jump veneer" which
/// is used when relative 26-bit call instructions won't cut it and a longer
/// jump is needed.
///
/// This generates:
///
/// ```ignore
/// ldr x16, 16
/// adr x17, 12
/// add x16, x16, x17
/// br x16
/// ```
///
/// and the expectation is that the 8-byte immediate used to compute the
/// jump target is placed immediately after these four instructions.
///
/// Note that this is part of the `MachBackend::generate_jump_veneer` contract.
pub fn gen_jump_veneer() -> (u32, u32, u32, u32) {
    (
        // ldr x16, 16
        enc_ldst_imm19(0b01011000, 16 / 4, xreg(16)),
        // adr x17, 12
        enc_adr(12, writable_xreg(17)),
        // add x16, x16, x17
        enc_arith_rrr(0b10001011_000, 0, writable_xreg(16), xreg(16), xreg(17)),
        // br x16
        enc_br(xreg(16)),
    )
}
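
As a quick illustration (not part of this change), the arithmetic behind the literal offsets in the veneer, assuming it is emitted starting at offset 0:

```rust
// Each AArch64 instruction is 4 bytes, so the 8-byte immediate that follows
// the four instructions lives at byte offset 16.
let imm_offset = 4 * 4;
// `ldr x16, 16` sits at offset 0 and loads the immediate from `pc + 16`.
assert_eq!(0 + 16, imm_offset);
// `adr x17, 12` sits at offset 4 and computes `pc + 12`, the immediate's own
// address; the `add`/`br` pair then jumps to `immediate + its address`.
assert_eq!(4 + 12, imm_offset);
```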

/// Memory addressing mode finalization: convert "special" modes (e.g.,
/// generic arbitrary stack offset) into real addressing modes, possibly by
/// emitting some helper instructions that come immediately before the use
17 changes: 17 additions & 0 deletions cranelift/codegen/src/isa/aarch64/mod.rs
@@ -161,6 +161,23 @@ impl MachBackend for AArch64Backend {
    fn create_systemv_cie(&self) -> Option<gimli::write::CommonInformationEntry> {
        Some(inst::unwind::systemv::create_cie())
    }

    fn max_jump_veneer_size(&self) -> usize {
        24 // 4 insns + 8-byte immediate
    }

    fn generate_jump_veneer(&self) -> (Vec<u8>, usize) {
        let (a, b, c, d) = inst::emit::gen_jump_veneer();
        let mut bytes = Vec::with_capacity(self.max_jump_veneer_size());
        bytes.extend_from_slice(&a.to_le_bytes());
        bytes.extend_from_slice(&b.to_le_bytes());
        bytes.extend_from_slice(&c.to_le_bytes());
        bytes.extend_from_slice(&d.to_le_bytes());
        let imm_start = bytes.len();
        bytes.extend_from_slice(&[0x00; 8]);
        assert_eq!(bytes.len(), self.max_jump_veneer_size());
        (bytes, imm_start)
    }
}
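
As an illustrative check (not in this diff) of the layout built above, assuming some `backend: AArch64Backend` value:

```rust
// Sketch: the veneer is four little-endian 32-bit instructions followed by a
// zeroed 8-byte slot for the immediate.
let (bytes, imm_start) = backend.generate_jump_veneer();
assert_eq!(bytes.len(), backend.max_jump_veneer_size()); // 24 bytes total
assert_eq!(imm_start, 16); // the immediate slot follows the 4 instructions
assert!(bytes[imm_start..].iter().all(|b| *b == 0));
```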

/// Create a new `isa::Builder`.
8 changes: 8 additions & 0 deletions cranelift/codegen/src/isa/arm32/mod.rs
@@ -115,6 +115,14 @@ impl MachBackend for Arm32Backend {
        // Carry flag clear.
        IntCC::UnsignedLessThan
    }

    fn generate_jump_veneer(&self) -> (Vec<u8>, usize) {
        panic!("not implemented for arm32 backend")
    }

    fn max_jump_veneer_size(&self) -> usize {
        0
    }
}

/// Create a new `isa::Builder`.
11 changes: 11 additions & 0 deletions cranelift/codegen/src/isa/s390x/mod.rs
@@ -165,6 +165,17 @@ impl MachBackend for S390xBackend {
    fn map_reg_to_dwarf(&self, reg: Reg) -> Result<u16, RegisterMappingError> {
        inst::unwind::systemv::map_reg(reg).map(|reg| reg.0)
    }

    fn max_jump_veneer_size(&self) -> usize {
        0
    }

    fn generate_jump_veneer(&self) -> (Vec<u8>, usize) {
        panic!(
            "jumps >= 2gb are not implemented yet on s390x, functions are \
             too far apart to have a relative call between them"
        );
    }
}

/// Create a new `isa::Builder`.
57 changes: 54 additions & 3 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -4,9 +4,9 @@ use crate::ir::LibCall;
use crate::ir::TrapCode;
use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength};
use crate::isa::x64::encoding::rex::{
-    emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
-    low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
-    RexFlags,
+    emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg,
+    encode_modrm, int_reg_enc, low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc,
+    LegacyPrefixes, OpcodeMap, RexFlags,
};
use crate::isa::x64::inst::args::*;
use crate::isa::x64::inst::*;
@@ -56,6 +56,57 @@ fn emit_reloc(
    sink.add_reloc(srcloc, kind, name, addend);
}

/// Generates the instructions necessary for a small "jump veneer" which is
/// used when relative 32-bit call instructions won't cut it and a longer jump
/// is needed.
///
/// This generates:
///
/// ```ignore
/// movabsq $val, %r10
/// lea -15(%rip), %r11
/// add %r10, %r11
/// jmpq *%r11
/// ```
///
/// The `-15` in the `lea` computes the address of the `movabsq` immediate:
/// the instruction pointer after the 7-byte `lea` sits at offset 17 from the
/// veneer's start, and `17 - 15 = 2` is the immediate's byte offset. The
/// `add` then forms `immediate + immediate's address`, i.e. a relative jump.
///
/// Note that this is part of the `MachBackend::generate_jump_veneer` contract.
pub fn gen_jump_veneer() -> (Vec<u8>, usize) {
    let mut bytes = Vec::with_capacity(jump_veneer_size());

    let r10 = int_reg_enc(regs::r10());
    let r11 = int_reg_enc(regs::r11());

    // movabsq $val, %r10
    bytes.push(0x48 | ((r10 >> 3) & 1));
    bytes.push(0xB8 | (r10 & 7));
    let imm_pos = bytes.len();
    bytes.extend_from_slice(&[0; 8]);

    // lea -15(%rip), %r11
    bytes.push(0x48 | (((r11 >> 3) & 1) << 2));
    bytes.push(0x8d);
    bytes.push(encode_modrm(0b00, r11 & 7, 0b101));
    bytes.extend_from_slice(&i32::to_le_bytes(-15));

    // add %r10, %r11 (REX.R extends the modrm `reg` field, %r10, and REX.B
    // the `rm` field, %r11; both happen to be extended registers here)
    bytes.push(0x48 | (((r10 >> 3) & 1) << 2) | ((r11 >> 3) & 1));
    bytes.push(0x01);
    bytes.push(encode_modrm(0b11, r10 & 7, r11 & 7));

    // jmpq *%r11
    bytes.push(0x40 | ((r11 >> 3) & 1));
    bytes.push(0xff);
    bytes.push(0xe0 | (r11 & 7));

    assert_eq!(bytes.len(), jump_veneer_size());
    (bytes, imm_pos)
}

/// See `gen_jump_veneer`.
pub fn jump_veneer_size() -> usize {
    23
}
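
As a cross-check of the byte layout (a sketch, not part of this change): `movabsq` is 2 + 8 bytes, `lea` is 7, `add` is 3, and `jmpq` is 3, totalling 23, with the immediate starting right after the two `movabsq` prefix/opcode bytes:

```rust
#[test]
fn jump_veneer_layout() {
    let (bytes, imm_pos) = gen_jump_veneer();
    assert_eq!(bytes.len(), jump_veneer_size()); // 23 bytes
    assert_eq!(imm_pos, 2); // immediate follows the REX prefix and opcode
    // The 8-byte immediate slot stays zeroed until it's patched.
    assert!(bytes[imm_pos..imm_pos + 8].iter().all(|b| *b == 0));
}
```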

/// The top-level emit function.
///
/// Important! Do not add improved (shortened) encoding cases to existing
2 changes: 2 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -28,6 +28,8 @@ pub mod unwind;
use args::*;
use regs::{create_reg_universe_systemv, show_ireg_sized};

pub use emit::{gen_jump_veneer, jump_veneer_size};

//=============================================================================
// Instructions (top level): definition

8 changes: 8 additions & 0 deletions cranelift/codegen/src/isa/x64/mod.rs
@@ -158,6 +158,14 @@ impl MachBackend for X64Backend {
    fn map_reg_to_dwarf(&self, reg: Reg) -> Result<u16, systemv::RegisterMappingError> {
        inst::unwind::systemv::map_reg(reg).map(|reg| reg.0)
    }

    fn max_jump_veneer_size(&self) -> usize {
        inst::jump_veneer_size()
    }

    fn generate_jump_veneer(&self) -> (Vec<u8>, usize) {
        inst::gen_jump_veneer()
    }
}

/// Create a new `isa::Builder`.
28 changes: 28 additions & 0 deletions cranelift/codegen/src/machinst/mod.rs
@@ -425,6 +425,34 @@ pub trait MachBackend {
    fn map_reg_to_dwarf(&self, _: Reg) -> Result<u16, RegisterMappingError> {
        Err(RegisterMappingError::UnsupportedArchitecture)
    }

    /// Generates a "veneer" which is used when a relative call instruction
    /// cannot reach its destination.
    ///
    /// Cranelift compiles wasm modules on a per-function basis, entirely
    /// isolated from all other functions. Ideally, functions also use
    /// relative calls between one another, both to avoid relocation fixups
    /// when a module is loaded and to make calls statically more predictable.
    /// These jumps, however, may not always be able to reach their
    /// destination, depending on the final layout of the executable.
    ///
    /// This function generates an executable code sequence which can be used
    /// to jump to an arbitrary pointer-sized immediate. It is only invoked
    /// when functions are too far apart to call each other with relative
    /// call instructions.
    ///
    /// The first element of the return value is the machine code of the
    /// veneer, and the second is the offset within the veneer at which an
    /// 8-byte immediate describing the target destination must be written.
    /// When jumped to, the veneer adds that 8-byte immediate to the address
    /// of the immediate itself and jumps to the result. In other words, the
    /// veneer performs a relative jump to the final destination, using a
    /// pointer-sized immediate as the displacement.
    fn generate_jump_veneer(&self) -> (Vec<u8>, usize);

    /// Returns the maximal size of the veneer returned by
    /// `generate_jump_veneer`.
    fn max_jump_veneer_size(&self) -> usize;
}

Member commented on `generate_jump_veneer`:

Two thoughts on this new addition to the backend trait:

• I think there might be an opportunity to merge this per-backend behavior into the `LabelUse` trait already defined by each backend. The generated trampoline code is playing almost the same role -- extending the range of a branch and allowing a shorter-range reloc to be converted to a longer-range one -- except that, unlike intra-function veneers, it also assumes the use of various registers.

  That last bit is a key difference. In the aarch64 case, x16 and x17 are used internally in instruction-lowering sequences but never across basic blocks, so this is fine; but r10 and r11 on x86-64 will potentially be used by regalloc, so we wouldn't want to blindly insert this as another type of veneer in the existing trait. We'd want to add some parameters to the `supports_veneer`, `veneer_size`, and `generate_veneer` functions to indicate what kind of veneer is requested ("next step up in range" or "absolute", maybe) and whether using registers as per the ABI for inter-function transfers is allowed.

  Whatever we do, it strikes me that the duplication here ("veneers" in two places, with similar APIs) is likely to confuse others, so we should somehow merge them or distinguish them better. Furthermore, if we're going to have low-level knowledge of branches (e.g. embedded machine-code bits in a sequence to emit), we should have that in as few places as possible.

• I am wondering if there is a better name than "veneer", if we don't merge; maybe "trampoline" or "linkage stub"?

Contributor commented:

This function could be useful for cranelift-jit too. It also needs a version that loads the address from memory at a specific location, like ELF PLTs do. Currently cranelift-jit only has an x86_64 implementation of the latter.
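
To make the patching side of this contract concrete, here is an illustrative sketch (not part of this change) of how a linking step might fill in the immediate; `veneer_addr` and `target` are hypothetical inputs:

```rust
/// Sketch only: write the displacement into a veneer produced by
/// `MachBackend::generate_jump_veneer`.
fn patch_jump_veneer(bytes: &mut [u8], imm_offset: usize, veneer_addr: u64, target: u64) {
    // The veneer jumps to `immediate + address-of-immediate`, so the stored
    // immediate is the distance from the immediate itself to the target.
    let imm_addr = veneer_addr + imm_offset as u64;
    let imm = target.wrapping_sub(imm_addr);
    bytes[imm_offset..imm_offset + 8].copy_from_slice(&imm.to_le_bytes());
}
```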

/// Expected unwind info type.
28 changes: 27 additions & 1 deletion crates/cranelift/src/builder.rs
@@ -13,6 +13,20 @@ use wasmtime_environ::{CompilerBuilder, Setting, SettingKind};
struct Builder {
    flags: settings::Builder,
    isa_flags: isa::Builder,
    linkopts: LinkOptions,
}

#[derive(Clone, Default)]
pub struct LinkOptions {
    /// A debug-only setting used to synthetically insert 0-byte padding
    /// between compiled functions to simulate huge compiled artifacts and
    /// exercise logic related to jump veneers.
    pub padding_between_functions: usize,

    /// A debug-only setting used to force inter-function calls in a wasm
    /// module to always go through "jump veneers", which are typically only
    /// generated when functions are very far from each other.
    pub force_jump_veneers: bool,
}

Member commented on `padding_between_functions`:

Do we still need this if we have the option below (force veneers) as well?

Member Author replied:

I'm going to leave this in for now because I think it's useful to exercise the veneers in real-world situations where they're actually needed. Otherwise I'd be worried that the logic for actually inserting veneers was only correct when the forcing was turned on. This way there's some exercising of the actual "ok yes we need that veneer" logic as well. (It's also relatively easy to support.)

Member replied:

Makes sense!

Would it make sense to use it only in tests with the aarch64 backend (which could be instantiated explicitly and doesn't need to run on an aarch64 host), where the threshold for inserting an island is much lower, so we don't have the overhead of 2GiB object files in tests on x86?

pub fn builder() -> Box<dyn CompilerBuilder> {
@@ -32,6 +46,7 @@ pub fn builder() -> Box<dyn CompilerBuilder> {
    Box::new(Builder {
        flags,
        isa_flags: cranelift_native::builder().expect("host machine is not a supported target"),
        linkopts: LinkOptions::default(),
    })
}

@@ -50,6 +65,17 @@ impl CompilerBuilder for Builder {
    }

    fn set(&mut self, name: &str, value: &str) -> Result<()> {
        // Special wasmtime-cranelift-only settings first
        if name == "wasmtime_linkopt_padding_between_functions" {
            self.linkopts.padding_between_functions = value.parse()?;
            return Ok(());
        }
        if name == "wasmtime_linkopt_force_jump_veneer" {
            self.linkopts.force_jump_veneers = value.parse()?;
            return Ok(());
        }

        // ... then forward this to Cranelift
        if let Err(err) = self.flags.set(name, value) {
            match err {
                SetError::BadName(_) => {
@@ -80,7 +106,7 @@
            .isa_flags
            .clone()
            .finish(settings::Flags::new(self.flags.clone()));
-        Box::new(crate::compiler::Compiler::new(isa))
+        Box::new(crate::compiler::Compiler::new(isa, self.linkopts.clone()))
    }

    fn settings(&self) -> Vec<Setting> {
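
For reference, a hedged usage sketch (not in this diff): the new debug-only options are toggled through the same `set` entry point as ordinary Cranelift flags. The surrounding wiring here is assumed:

```rust
use anyhow::Result;

fn configure_for_veneer_testing() -> Result<()> {
    let mut builder = builder(); // the `Box<dyn CompilerBuilder>` from above
    // Force every inter-function call through a jump veneer.
    builder.set("wasmtime_linkopt_force_jump_veneer", "true")?;
    // Insert a megabyte of zero padding between compiled functions.
    builder.set("wasmtime_linkopt_padding_between_functions", "1048576")?;
    Ok(())
}
```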