//! Pulley binary code emission.
use super::*;
use crate::ir::{self, Endianness};
use crate::isa::pulley_shared::abi::PulleyMachineDeps;
use crate::isa::pulley_shared::PointerWidth;
use core::marker::PhantomData;
use cranelift_control::ControlPlane;
use pulley_interpreter::encode as enc;
use pulley_interpreter::regs::BinaryOperands;
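/// Static information shared across the emission of all instructions: the
/// shared Cranelift flags plus the Pulley-specific ISA flags.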
pub struct EmitInfo {
#[allow(dead_code)] // Will get used as we fill out this backend.
shared_flags: settings::Flags,
#[allow(dead_code)] // Will get used as we fill out this backend.
isa_flags: crate::isa::pulley_shared::settings::Flags,
}
impl EmitInfo {
pub(crate) fn new(
shared_flags: settings::Flags,
isa_flags: crate::isa::pulley_shared::settings::Flags,
) -> Self {
Self {
shared_flags,
isa_flags,
}
}
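    /// Resolve the endianness for a memory access: an explicit endianness in
    /// `flags` takes precedence, otherwise the target's native endianness
    /// (from the ISA flags) is used.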
fn endianness(&self, flags: MemFlags) -> Endianness {
let target_endianness = if self.isa_flags.big_endian() {
Endianness::Big
} else {
Endianness::Little
};
flags.endianness(target_endianness)
}
}
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState<P>
where
    P: PulleyTargetKind,
{
    _phantom: PhantomData<P>,
    ctrl_plane: ControlPlane,
    user_stack_map: Option<ir::UserStackMap>,
    frame_layout: FrameLayout,
}
impl<P> EmitState<P>
where
P: PulleyTargetKind,
{
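    /// Take (and clear) the user stack map recorded by `pre_safepoint`, if
    /// any.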
    fn take_stack_map(&mut self) -> Option<ir::UserStackMap> {
self.user_stack_map.take()
}
}
impl<P> MachInstEmitState<InstAndKind<P>> for EmitState<P>
where
P: PulleyTargetKind,
{
    fn new(abi: &Callee<PulleyMachineDeps<P>>, ctrl_plane: ControlPlane) -> Self {
EmitState {
_phantom: PhantomData,
ctrl_plane,
user_stack_map: None,
frame_layout: abi.frame_layout().clone(),
}
}
    fn pre_safepoint(&mut self, user_stack_map: Option<ir::UserStackMap>) {
self.user_stack_map = user_stack_map;
}
fn ctrl_plane_mut(&mut self) -> &mut ControlPlane {
&mut self.ctrl_plane
}
fn take_ctrl_plane(self) -> ControlPlane {
self.ctrl_plane
}
fn frame_layout(&self) -> &FrameLayout {
&self.frame_layout
}
}
impl<P> MachInstEmit for InstAndKind<P>
where
P: PulleyTargetKind,
{
    type State = EmitState<P>;
type Info = EmitInfo;
    fn emit(&self, sink: &mut MachBuffer<Self>, emit_info: &Self::Info, state: &mut Self::State) {
        // N.B.: we *must* not exceed the "worst-case size" used to compute
        // where to insert islands, except when islands are explicitly
        // triggered (with an `EmitIsland`). The assertion below checks this.
        // The start offset is `mut` so that instructions which emit their own
        // islands, such as `BrTable` and the return-call sequences, can reset
        // it and skip the check.
let mut start = sink.cur_offset();
pulley_emit(self, sink, emit_info, state, &mut start);
let end = sink.cur_offset();
assert!(
            (end - start) <= InstAndKind::<P>::worst_case_size(),
"encoded inst {self:?} longer than worst-case size: length: {}, Inst::worst_case_size() = {}",
end - start,
            InstAndKind::<P>::worst_case_size()
);
}
fn pretty_print_inst(&self, state: &mut Self::State) -> String {
self.print_with_state(state)
}
}
/// Representation of a static offset from a pointer.
///
/// In VCode this is always represented as an `i32`; just before emission it
/// is converted to this type to determine which instruction encoding to use.
enum Offset {
/// An unsigned 8-bit offset.
U8(u8),
/// A signed 32-bit offset.
I32(i32),
}
impl From<i32> for Offset {
fn from(i: i32) -> Offset {
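        // Prefer the compact unsigned 8-bit form when the offset fits;
        // otherwise fall back to the general signed 32-bit form.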
if let Ok(i) = i.try_into() {
return Offset::U8(i);
}
Offset::I32(i)
}
}
fn pulley_emit<P>(
    inst: &Inst,
    sink: &mut MachBuffer<InstAndKind<P>>,
    emit_info: &EmitInfo,
    state: &mut EmitState<P>,
start_offset: &mut u32,
) where
P: PulleyTargetKind,
{
match inst {
        // Pseudo-instructions that don't actually encode to anything.
Inst::Args { .. } | Inst::Rets { .. } => {}
Inst::TrapIf { cond, code } => {
let trap = sink.defer_trap(*code);
let not_trap = sink.get_label();
            <InstAndKind<P>>::from(Inst::BrIf {
cond: cond.clone(),
taken: trap,
not_taken: not_trap,
})
.emit(sink, emit_info, state);
sink.bind_label(not_trap, &mut state.ctrl_plane);
}
Inst::Nop => todo!(),
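        // Special registers are plain `x` registers in Pulley, so reading one
        // is just a register-to-register move.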
Inst::GetSpecial { dst, reg } => enc::xmov(sink, dst, reg),
Inst::LoadExtName { .. } => todo!(),
Inst::Call { info } => {
let offset = sink.cur_offset();
// If arguments happen to already be in the right register for the
// ABI then remove them from this list. Otherwise emit the
// appropriate `Call` instruction depending on how many arguments we
// have that aren't already in their correct register according to
// ABI conventions.
let mut args = &info.dest.args[..];
while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) {
args = &args[..args.len() - 1];
}
match args {
[] => enc::call(sink, 0),
[x0] => enc::call1(sink, x0, 0),
[x0, x1] => enc::call2(sink, x0, x1, 0),
[x0, x1, x2] => enc::call3(sink, x0, x1, x2, 0),
[x0, x1, x2, x3] => enc::call4(sink, x0, x1, x2, x3, 0),
_ => unreachable!(),
}
let end = sink.cur_offset();
sink.add_reloc_at_offset(
end - 4,
// TODO: is it actually okay to reuse this reloc here?
Reloc::X86CallPCRel4,
&info.dest.name,
// This addend adjusts for the difference between the start of
// the instruction and the beginning of the immediate offset
// field which is always the final 4 bytes of the instruction.
-i64::from(end - offset - 4),
);
if let Some(s) = state.take_stack_map() {
let offset = sink.cur_offset();
sink.push_user_stack_map(state, offset, s);
}
sink.add_call_site();
let adjust = -i32::try_from(info.callee_pop_size).unwrap();
            for i in PulleyMachineDeps::<P>::gen_sp_reg_adjust(adjust) {
                <InstAndKind<P>>::from(i).emit(sink, emit_info, state);
}
}
Inst::IndirectCall { info } => {
enc::call_indirect(sink, info.dest);
if let Some(s) = state.take_stack_map() {
let offset = sink.cur_offset();
sink.push_user_stack_map(state, offset, s);
}
sink.add_call_site();
let adjust = -i32::try_from(info.callee_pop_size).unwrap();
            for i in PulleyMachineDeps::<P>::gen_sp_reg_adjust(adjust) {
                <InstAndKind<P>>::from(i).emit(sink, emit_info, state);
}
}
Inst::ReturnCall { info } => {
emit_return_call_common_sequence(sink, emit_info, state, &info);
// Emit an unconditional jump which is quite similar to `Inst::Call`
// except that a `jump` opcode is used instead of a `call` opcode.
sink.put1(pulley_interpreter::Opcode::Jump as u8);
sink.add_reloc(Reloc::X86CallPCRel4, &info.dest, -1);
sink.put4(0);
// Islands were manually handled in
// `emit_return_call_common_sequence`.
*start_offset = sink.cur_offset();
}
Inst::ReturnIndirectCall { info } => {
emit_return_call_common_sequence(sink, emit_info, state, &info);
enc::xjump(sink, info.dest);
// Islands were manually handled in
// `emit_return_call_common_sequence`.
*start_offset = sink.cur_offset();
}
Inst::IndirectCallHost { info } => {
// Emit a relocation to fill in the actual immediate argument here
// in `call_indirect_host`.
sink.add_reloc(Reloc::PulleyCallIndirectHost, &info.dest, 0);
enc::call_indirect_host(sink, 0_u8);
if let Some(s) = state.take_stack_map() {
let offset = sink.cur_offset();
sink.push_user_stack_map(state, offset, s);
}
sink.add_call_site();
            // If a callee pop is happening here then something has gone wrong:
            // these are expected to be "very simple" signatures.
assert!(info.callee_pop_size == 0);
}
Inst::Jump { label } => {
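            // A `jump` is a 1-byte opcode followed by a 4-byte `PcRelOffset`,
            // so the label is patched in at `start + 1` and the branch spans
            // 5 bytes total.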
sink.use_label_at_offset(*start_offset + 1, *label, LabelUse::Jump(1));
sink.add_uncond_branch(*start_offset, *start_offset + 5, *label);
enc::jump(sink, 0x00000000);
}
Inst::BrIf {
cond,
taken,
not_taken,
} => {
            // Encode the inverted form of the branch into a side buffer so the
            // `MachBuffer` can use it if it later inverts this branch. Branches
            // always have their trailing 4 bytes as the relative offset, which
            // is what we're going to target here within the `MachBuffer`.
let mut inverted = SmallVec::<[u8; 16]>::new();
cond.invert().encode(&mut inverted);
let len = inverted.len() as u32;
debug_assert!(len > 4);
// Use the `taken` label 4 bytes before the end of the instruction
// we're about to emit as that's the base of `PcRelOffset`. Note
// that the `Jump` here factors in the offset from the start of the
// instruction to the start of the relative offset, hence `len - 4`
// as the factor to adjust by.
let taken_end = *start_offset + len;
sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::Jump(len - 4));
sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted);
cond.encode(sink);
debug_assert_eq!(sink.cur_offset(), taken_end);
// For the not-taken branch use an unconditional jump to the
// relevant label, and we know that the jump instruction is 5 bytes
// long where the final 4 bytes are the offset to jump by.
let not_taken_start = taken_end + 1;
let not_taken_end = not_taken_start + 4;
sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::Jump(1));
sink.add_uncond_branch(taken_end, not_taken_end, *not_taken);
enc::jump(sink, 0x00000000);
assert_eq!(sink.cur_offset(), not_taken_end);
}
Inst::LoadAddr { dst, mem } => {
let base = mem.get_base_register();
let offset = mem.get_offset_with_state(state);
if let Some(base) = base {
if offset == 0 {
enc::xmov(sink, dst, base);
} else {
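                    // Materialize the offset into `dst` with the smallest
                    // constant instruction that fits, then add the base
                    // register using a pointer-width add.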
if let Ok(offset) = i8::try_from(offset) {
enc::xconst8(sink, dst, offset);
} else if let Ok(offset) = i16::try_from(offset) {
enc::xconst16(sink, dst, offset);
} else {
enc::xconst32(sink, dst, offset);
}
match P::pointer_width() {
PointerWidth::PointerWidth32 => {
enc::xadd32(sink, BinaryOperands::new(dst, base, dst))
}
PointerWidth::PointerWidth64 => {
enc::xadd64(sink, BinaryOperands::new(dst, base, dst))
}
}
}
} else {
unreachable!("all pulley amodes have a base register right now")
}
}
Inst::XLoad {
dst,
mem,
ty,
flags,
ext,
} => {
use Endianness as E;
use ExtKind as X;
let r = mem.get_base_register().unwrap();
let x = mem.get_offset_with_state(state);
let endian = emit_info.endianness(*flags);
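            // Select an encoding based on the loaded type, the extension kind,
            // and the access endianness; the compact 8-bit-offset forms are
            // used when the offset fits and such an encoding exists.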
match *ty {
I8 => match ext {
X::None | X::Zero32 => match x.into() {
Offset::I32(x) => enc::xload8_u32_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload8_u32_offset8(sink, dst, r, x),
},
X::Zero64 => match x.into() {
Offset::I32(x) => enc::xload8_u64_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload8_u64_offset8(sink, dst, r, x),
},
X::Sign32 => match x.into() {
Offset::I32(x) => enc::xload8_s32_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload8_s32_offset8(sink, dst, r, x),
},
X::Sign64 => match x.into() {
Offset::I32(x) => enc::xload8_s64_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload8_s64_offset8(sink, dst, r, x),
},
},
I16 => match (ext, endian) {
(X::None | X::Zero32, E::Little) => match x.into() {
Offset::I32(x) => enc::xload16le_u32_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload16le_u32_offset8(sink, dst, r, x),
},
(X::Sign32, E::Little) => match x.into() {
Offset::I32(x) => enc::xload16le_s32_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload16le_s32_offset8(sink, dst, r, x),
},
(X::Zero64, E::Little) => match x.into() {
Offset::I32(x) => enc::xload16le_u64_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload16le_u64_offset8(sink, dst, r, x),
},
(X::Sign64, E::Little) => match x.into() {
Offset::I32(x) => enc::xload16le_s64_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload16le_s64_offset8(sink, dst, r, x),
},
(X::None | X::Zero32 | X::Zero64, E::Big) => {
enc::xload16be_u64_offset32(sink, dst, r, x);
}
(X::Sign32 | X::Sign64, E::Big) => {
enc::xload16be_s64_offset32(sink, dst, r, x);
}
},
I32 => match (ext, endian) {
(X::None | X::Zero32 | X::Sign32, E::Little) => match x.into() {
Offset::I32(x) => enc::xload32le_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload32le_offset8(sink, dst, r, x),
},
(X::Zero64, E::Little) => match x.into() {
Offset::I32(x) => enc::xload32le_u64_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload32le_u64_offset8(sink, dst, r, x),
},
(X::Sign64, E::Little) => match x.into() {
Offset::I32(x) => enc::xload32le_s64_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload32le_s64_offset8(sink, dst, r, x),
},
(X::None | X::Zero32 | X::Zero64, E::Big) => {
enc::xload32be_u64_offset32(sink, dst, r, x);
}
(X::Sign32 | X::Sign64, E::Big) => {
enc::xload32be_s64_offset32(sink, dst, r, x);
}
},
I64 => match endian {
E::Little => match x.into() {
Offset::I32(x) => enc::xload64le_offset32(sink, dst, r, x),
Offset::U8(x) => enc::xload64le_offset8(sink, dst, r, x),
},
E::Big => enc::xload64be_offset32(sink, dst, r, x),
},
_ => unimplemented!("xload ty={ty:?}"),
}
}
Inst::FLoad {
dst,
mem,
ty,
flags,
} => {
use Endianness as E;
let r = mem.get_base_register().unwrap();
let x = mem.get_offset_with_state(state);
let endian = emit_info.endianness(*flags);
match *ty {
F32 => match endian {
E::Little => enc::fload32le_offset32(sink, dst, r, x),
E::Big => enc::fload32be_offset32(sink, dst, r, x),
},
F64 => match endian {
E::Little => enc::fload64le_offset32(sink, dst, r, x),
E::Big => enc::fload64be_offset32(sink, dst, r, x),
},
_ => unimplemented!("fload ty={ty:?}"),
}
}
Inst::VLoad {
dst,
mem,
ty,
flags,
ext,
} => {
let r = mem.get_base_register().unwrap();
let x = mem.get_offset_with_state(state);
let endian = emit_info.endianness(*flags);
assert_eq!(endian, Endianness::Little);
assert_eq!(ty.bytes(), 16);
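            // Only little-endian vector loads producing a 128-bit result are
            // supported; `ext` selects between a full load and the narrower
            // widening loads.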
match ext {
VExtKind::None => enc::vload128le_offset32(sink, dst, r, x),
VExtKind::S8x8 => enc::vload8x8_s_offset32(sink, dst, r, x),
VExtKind::U8x8 => enc::vload8x8_u_offset32(sink, dst, r, x),
VExtKind::S16x4 => enc::vload16x4le_s_offset32(sink, dst, r, x),
VExtKind::U16x4 => enc::vload16x4le_u_offset32(sink, dst, r, x),
VExtKind::S32x2 => enc::vload32x2le_s_offset32(sink, dst, r, x),
VExtKind::U32x2 => enc::vload32x2le_u_offset32(sink, dst, r, x),
}
}
Inst::XStore {
mem,
src,
ty,
flags,
} => {
use Endianness as E;
let r = mem.get_base_register().unwrap();
let x = mem.get_offset_with_state(state);
let endian = emit_info.endianness(*flags);
match *ty {
I8 => match x.into() {
Offset::I32(x) => enc::xstore8_offset32(sink, r, x, src),
Offset::U8(x) => enc::xstore8_offset8(sink, r, x, src),
},
I16 => match endian {
E::Little => match x.into() {
Offset::I32(x) => enc::xstore16le_offset32(sink, r, x, src),
Offset::U8(x) => enc::xstore16le_offset8(sink, r, x, src),
},
E::Big => enc::xstore16be_offset32(sink, r, x, src),
},
I32 => match endian {
E::Little => match x.into() {
Offset::I32(x) => enc::xstore32le_offset32(sink, r, x, src),
Offset::U8(x) => enc::xstore32le_offset8(sink, r, x, src),
},
E::Big => enc::xstore32be_offset32(sink, r, x, src),
},
I64 => match endian {
E::Little => match x.into() {
Offset::I32(x) => enc::xstore64le_offset32(sink, r, x, src),
Offset::U8(x) => enc::xstore64le_offset8(sink, r, x, src),
},
E::Big => enc::xstore64be_offset32(sink, r, x, src),
},
_ => unimplemented!("xstore ty={ty:?}"),
}
}
Inst::FStore {
mem,
src,
ty,
flags,
} => {
use Endianness as E;
let r = mem.get_base_register().unwrap();
let x = mem.get_offset_with_state(state);
let endian = emit_info.endianness(*flags);
match *ty {
F32 => match endian {
E::Little => enc::fstore32le_offset32(sink, r, x, src),
E::Big => enc::fstore32be_offset32(sink, r, x, src),
},
F64 => match endian {
E::Little => enc::fstore64le_offset32(sink, r, x, src),
E::Big => enc::fstore64be_offset32(sink, r, x, src),
},
_ => unimplemented!("fstore ty={ty:?}"),
}
}
Inst::VStore {
mem,
src,
ty,
flags,
} => {
let r = mem.get_base_register().unwrap();
let x = mem.get_offset_with_state(state);
let endian = emit_info.endianness(*flags);
assert_eq!(endian, Endianness::Little);
assert_eq!(ty.bytes(), 16);
enc::vstore128le_offset32(sink, r, x, src);
}
Inst::BrTable {
idx,
default,
targets,
} => {
// Encode the `br_table32` instruction directly which expects the
// next `amt` 4-byte integers to all be relative offsets. Each
// offset is the pc-relative offset of the branch destination.
//
// Pulley clamps the branch targets to the `amt` specified so the
// final branch target is the default jump target.
//
// Note that this instruction may have many branch targets so it
// manually checks to see if an island is needed. If so we emit a
// jump around the island before the `br_table32` itself gets
// emitted.
let amt = u32::try_from(targets.len() + 1).expect("too many branch targets");
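            // Size of the jump table: the `br_table32` opcode and its operands
            // (6 bytes) followed by one 4-byte relative offset per target,
            // including the default.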
let br_table_size = amt * 4 + 6;
if sink.island_needed(br_table_size) {
let label = sink.get_label();
                <InstAndKind<P>>::from(Inst::Jump { label }).emit(sink, emit_info, state);
sink.emit_island(br_table_size, &mut state.ctrl_plane);
sink.bind_label(label, &mut state.ctrl_plane);
}
enc::br_table32(sink, *idx, amt);
for target in targets.iter() {
let offset = sink.cur_offset();
sink.use_label_at_offset(offset, *target, LabelUse::Jump(0));
sink.put4(0);
}
let offset = sink.cur_offset();
sink.use_label_at_offset(offset, *default, LabelUse::Jump(0));
sink.put4(0);
// We manually handled `emit_island` above when dealing with
// `island_needed` so update the starting offset to the current
// offset so this instruction doesn't accidentally trigger
// the assertion that we're always under worst-case-size.
*start_offset = sink.cur_offset();
}
Inst::Raw { raw } => {
match raw {
RawInst::PushFrame
| RawInst::StackAlloc32 { .. }
| RawInst::PushFrameSave { .. } => {
sink.add_trap(ir::TrapCode::STACK_OVERFLOW);
}
_ => {}
}
super::generated::emit(raw, sink)
}
}
}
fn emit_return_call_common_sequence<T, P>(
    sink: &mut MachBuffer<InstAndKind<P>>,
    emit_info: &EmitInfo,
    state: &mut EmitState<P>,
    info: &ReturnCallInfo<T>,
) where
P: PulleyTargetKind,
{
    // The return call sequence can potentially emit a lot of instructions, so
    // let's emit an island here if we need it.
    //
    // It is difficult to calculate exactly how much code is going to be
    // emitted, so we measure it by emitting the sequence into a disposable
    // buffer and then checking how many bytes were actually emitted.
let mut buffer = MachBuffer::new();
let mut fake_emit_state = state.clone();
return_call_emit_impl(&mut buffer, emit_info, &mut fake_emit_state, info);
// Finalize the buffer and get the number of bytes emitted.
let buffer = buffer.finish(&Default::default(), &mut Default::default());
let length = buffer.data().len() as u32;
// And now emit the island inline with this instruction.
if sink.island_needed(length) {
let jump_around_label = sink.get_label();
        <InstAndKind<P>>::gen_jump(jump_around_label).emit(sink, emit_info, state);
sink.emit_island(length + 4, &mut state.ctrl_plane);
sink.bind_label(jump_around_label, &mut state.ctrl_plane);
}
// Now that we're done, emit the *actual* return sequence.
return_call_emit_impl(sink, emit_info, state, info);
}
/// This should not be called directly. Instead, prefer to call
/// [emit_return_call_common_sequence].
fn return_call_emit_impl<T, P>(
    sink: &mut MachBuffer<InstAndKind<P>>,
    emit_info: &EmitInfo,
    state: &mut EmitState<P>,
    info: &ReturnCallInfo<T>,
) where
P: PulleyTargetKind,
{
let sp_to_fp_offset = {
let frame_layout = state.frame_layout();
i64::from(
frame_layout.clobber_size
+ frame_layout.fixed_frame_storage_size
+ frame_layout.outgoing_args_size,
)
};
// Restore all clobbered registers before leaving the function.
let mut clobber_offset = sp_to_fp_offset - 8;
for reg in state.frame_layout().clobbered_callee_saves.clone() {
let rreg = reg.to_reg();
let ty = match rreg.class() {
RegClass::Int => I64,
RegClass::Float => F64,
RegClass::Vector => unimplemented!("Vector Clobber Restores"),
};
        <InstAndKind<P>>::from(Inst::gen_load(
reg.map(Reg::from),
Amode::SpOffset {
offset: clobber_offset.try_into().unwrap(),
},
ty,
MemFlags::trusted(),
))
.emit(sink, emit_info, state);
clobber_offset -= 8
}
// Restore the link register and frame pointer using a `pop_frame`
// instruction. This will move `sp` to the current frame pointer and then
// restore the old lr/fp, so this restores all of sp/fp/lr in one
// instruction.
let setup_area_size = i64::from(state.frame_layout().setup_area_size);
assert!(setup_area_size > 0, "must have frame pointers enabled");
    <InstAndKind<P>>::from(RawInst::PopFrame).emit(sink, emit_info, state);
// Now that `sp` is restored to what it was on function entry it may need to
// be adjusted if the stack arguments of our own function differ from the
// stack arguments of the callee. Perform any necessary adjustment here.
//
// Note that this means that there's a brief window where stack arguments
// might be below `sp` in the case that the callee has more stack arguments
// than ourselves. That's in theory ok though as we're inventing the pulley
// ABI and nothing like async signals are happening that we have to worry
// about.
let incoming_args_diff =
i64::from(state.frame_layout().tail_args_size - info.new_stack_arg_size);
if incoming_args_diff != 0 {
let amt = i32::try_from(incoming_args_diff).unwrap();
        for inst in PulleyMachineDeps::<P>::gen_sp_reg_adjust(amt) {
            <InstAndKind<P>>::from(inst).emit(sink, emit_info, state);
}
}
}