//! Riscv64 ISA: binary code emission.

use crate::ir::{self, LibCall, TrapCode};
use crate::isa::riscv64::inst::*;
use crate::isa::riscv64::lower::isle::generated_code::{
    CaOp, CbOp, CiOp, CiwOp, ClOp, CrOp, CsOp, CssOp, CsznOp, FpuOPWidth, ZcbMemOp,
};
use cranelift_control::ControlPlane;

pub struct EmitInfo {
    shared_flag: settings::Flags,
    isa_flags: super::super::riscv_settings::Flags,
}

impl EmitInfo {
    pub(crate) fn new(
        shared_flag: settings::Flags,
        isa_flags: super::super::riscv_settings::Flags,
    ) -> Self {
        Self {
            shared_flag,
            isa_flags,
        }
    }
}

pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 {
    u32::from(m.to_real_reg().unwrap().hw_enc() & 31)
}

pub(crate) fn reg_to_compressed_gpr_num(m: Reg) -> u32 {
    let real_reg = m.to_real_reg().unwrap().hw_enc();
    debug_assert!(real_reg >= 8 && real_reg < 16);
    let compressed_reg = real_reg - 8;
    u32::from(compressed_reg)
}

#[derive(Clone, Debug, PartialEq, Default)]
pub enum EmitVState {
    #[default]
    Unknown,
    Known(VState),
}

/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
    /// The user stack map for the upcoming instruction, as provided to
    /// `pre_safepoint()`.
    user_stack_map: Option<ir::UserStackMap>,

    /// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
    /// optimized away at compile time. See [cranelift_control].
    ctrl_plane: ControlPlane,

    /// Vector state: the current state of the vector unit at the emission point.
    vstate: EmitVState,

    frame_layout: FrameLayout,
}

impl EmitState {
    fn take_stack_map(&mut self) -> Option<ir::UserStackMap> {
        self.user_stack_map.take()
    }
}

impl MachInstEmitState<Inst> for EmitState {
    fn new(abi: &Callee<Riscv64MachineDeps>, ctrl_plane: ControlPlane) -> Self {
        EmitState {
            user_stack_map: None,
            ctrl_plane,
            vstate: EmitVState::Unknown,
            frame_layout: abi.frame_layout().clone(),
        }
    }

    fn pre_safepoint(&mut self, user_stack_map: Option<ir::UserStackMap>) {
        self.user_stack_map = user_stack_map;
    }

    fn ctrl_plane_mut(&mut self) -> &mut ControlPlane {
        &mut self.ctrl_plane
    }

    fn take_ctrl_plane(self) -> ControlPlane {
        self.ctrl_plane
    }

    fn on_new_block(&mut self) {
        // Reset the vector state.
        self.vstate = EmitVState::Unknown;
    }

    fn frame_layout(&self) -> &FrameLayout {
        &self.frame_layout
    }
}

impl Inst {
    /// Load an integer mask into `rd`: the low `ty.bits()` bits are set.
    pub(crate) fn load_int_mask(rd: Writable<Reg>, ty: Type) -> SmallInstVec<Inst> {
        let mut insts = SmallInstVec::new();
        assert!(ty.is_int() && ty.bits() <= 64);
        match ty {
            I64 => {
                insts.push(Inst::load_imm12(rd, Imm12::from_i16(-1)));
            }
            I32 | I16 => {
                insts.push(Inst::load_imm12(rd, Imm12::from_i16(-1)));
                insts.push(Inst::Extend {
                    rd,
                    rn: rd.to_reg(),
                    signed: false,
                    from_bits: ty.bits() as u8,
                    to_bits: 64,
                });
            }
            I8 => {
                insts.push(Inst::load_imm12(rd, Imm12::from_i16(255)));
            }
            _ => unreachable!("ty:{:?}", ty),
        }
        insts
    }

    /// Invert all bits.
    pub(crate) fn construct_bit_not(rd: Writable<Reg>, rs: Reg) -> Inst {
        Inst::AluRRImm12 {
            alu_op: AluOPRRI::Xori,
            rd,
            rs,
            imm12: Imm12::from_i16(-1),
        }
    }

    /// Returns Some(VState) if this instruction is expecting a specific vector state
    /// before emission.
    fn expected_vstate(&self) -> Option<&VState> {
        match self {
            Inst::Nop0 | Inst::Nop4 | Inst::BrTable { .. } | Inst::Auipc { .. }
            | Inst::Fli { .. } | Inst::Lui { .. } | Inst::LoadInlineConst { .. }
            | Inst::AluRRR { .. } | Inst::FpuRRR { .. } | Inst::AluRRImm12 { .. }
            | Inst::CsrReg { .. } | Inst::CsrImm { .. } | Inst::Load { .. }
            | Inst::Store { .. } | Inst::Args { .. } | Inst::Rets { .. } | Inst::Ret { .. }
            | Inst::Extend { .. } | Inst::Call { .. } | Inst::CallInd { .. }
            | Inst::ReturnCall { .. } | Inst::ReturnCallInd { .. } | Inst::Jal { .. }
            | Inst::CondBr { .. } | Inst::LoadExtName { .. } | Inst::ElfTlsGetAddr { .. }
            | Inst::LoadAddr { .. } | Inst::Mov { .. } | Inst::MovFromPReg { .. }
            | Inst::Fence { .. } | Inst::EBreak | Inst::Udf { .. } | Inst::FpuRR { .. }
            | Inst::FpuRRRR { .. } | Inst::Jalr { .. } | Inst::Atomic { .. }
            | Inst::Select { .. } | Inst::AtomicCas { .. } | Inst::RawData { .. }
            | Inst::AtomicStore { .. } | Inst::AtomicLoad { .. } | Inst::AtomicRmwLoop { .. }
            | Inst::TrapIf { .. } | Inst::Unwind { .. } | Inst::DummyUse { .. }
            | Inst::Popcnt { .. } | Inst::Cltz { .. } | Inst::Brev8 { .. }
            | Inst::StackProbeLoop { .. } => None,

            // VecSetState does not expect any vstate, rather it updates it.
            Inst::VecSetState { .. } => None,

            // `vmv` instructions copy a set of registers and ignore vstate.
            Inst::VecAluRRImm5 {
                op: VecAluOpRRImm5::VmvrV,
                ..
            } => None,

            Inst::VecAluRR { vstate, .. } |
            Inst::VecAluRRR { vstate, .. } |
            Inst::VecAluRRRR { vstate, .. } |
            Inst::VecAluRImm5 { vstate, .. } |
            Inst::VecAluRRImm5 { vstate, .. } |
            Inst::VecAluRRRImm5 { vstate, .. } |
            // TODO: Unit-stride loads and stores only need the AVL to be correct, not
            // the full vtype. A future optimization could be to decouple these two when
            // updating vstate. This would allow us to avoid emitting a VecSetState in
            // some cases.
            Inst::VecLoad { vstate, .. } |
            Inst::VecStore { vstate, .. } => Some(vstate),
        }
    }
}

impl MachInstEmit for Inst {
    type State = EmitState;
    type Info = EmitInfo;

    fn emit(&self, sink: &mut MachBuffer<Inst>, emit_info: &Self::Info, state: &mut EmitState) {
        // Check if we need to update the vector state before emitting this instruction.
        if let Some(expected) = self.expected_vstate() {
            if state.vstate != EmitVState::Known(*expected) {
                // Update the vector state.
                Inst::VecSetState {
                    rd: writable_zero_reg(),
                    vstate: *expected,
                }
                .emit(sink, emit_info, state);
            }
        }

        // N.B.: we *must* not exceed the "worst-case size" used to compute
        // where to insert islands, except when islands are explicitly triggered
        // (with an `EmitIsland`). We check this in debug builds. This is `mut`
        // to allow disabling the check for `BrTable`, which is always
        // emitted following an `EmitIsland`.
        let mut start_off = sink.cur_offset();

        // First try to emit this as a compressed instruction.
        let res = self.try_emit_compressed(sink, emit_info, state, &mut start_off);
        if res.is_none() {
            // If we can't, emit it as a normal instruction.
            self.emit_uncompressed(sink, emit_info, state, &mut start_off);
        }

        // We exclude br_table and return call from these checks since they emit
        // their own islands, and thus are allowed to exceed the worst case size.
        if !matches!(
            self,
            Inst::BrTable { .. } | Inst::ReturnCall { .. } | Inst::ReturnCallInd { .. }
        ) {
            let end_off = sink.cur_offset();
            assert!(
                (end_off - start_off) <= Inst::worst_case_size(),
                "Inst:{:?} length:{} worst_case_size:{}",
                self,
                end_off - start_off,
                Inst::worst_case_size()
            );
        }
    }

    fn pretty_print_inst(&self, state: &mut Self::State) -> String {
        self.print_with_state(state)
    }
}

impl Inst {
    /// Tries to emit an instruction as compressed; returns `None` if it
    /// cannot be compressed.
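    ///
    /// On success the 2-byte compressed encoding has already been written to
    /// `sink` and `Some(())` is returned.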
    fn try_emit_compressed(
        &self,
        sink: &mut MachBuffer<Inst>,
        emit_info: &EmitInfo,
        state: &mut EmitState,
        start_off: &mut u32,
    ) -> Option<()> {
        let has_m = emit_info.isa_flags.has_m();
        let has_zba = emit_info.isa_flags.has_zba();
        let has_zbb = emit_info.isa_flags.has_zbb();
        let has_zca = emit_info.isa_flags.has_zca();
        let has_zcb = emit_info.isa_flags.has_zcb();
        let has_zcd = emit_info.isa_flags.has_zcd();

        // Currently all compressed extensions (Zcb, Zcd, Zcmp, Zcmt, etc.) require Zca
        // to be enabled, so check it early.
        if !has_zca {
            return None;
        }

        fn reg_is_compressible(r: Reg) -> bool {
            r.to_real_reg()
                .map(|r| r.hw_enc() >= 8 && r.hw_enc() < 16)
                .unwrap_or(false)
        }

        match *self {
            // C.ADD
            Inst::AluRRR {
                alu_op: AluOPRRR::Add,
                rd,
                rs1,
                rs2,
            } if (rd.to_reg() == rs1 || rd.to_reg() == rs2)
                && rs1 != zero_reg()
                && rs2 != zero_reg() =>
            {
                // Technically `c.add rd, rs` expands to `add rd, rd, rs`, but we can
                // also swap rs1 with rs2 and we get an equivalent instruction. i.e. we
                // can also compress `add rd, rs, rd` into `c.add rd, rs`.
                let src = if rd.to_reg() == rs1 { rs2 } else { rs1 };
                sink.put2(encode_cr_type(CrOp::CAdd, rd, src));
            }

            // C.MV
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi | AluOPRRI::Ori,
                rd,
                rs,
                imm12,
            } if rd.to_reg() != rs
                && rd.to_reg() != zero_reg()
                && rs != zero_reg()
                && imm12.as_i16() == 0 =>
            {
                sink.put2(encode_cr_type(CrOp::CMv, rd, rs));
            }

            // CA Ops
            Inst::AluRRR {
                alu_op:
                    alu_op @ (AluOPRRR::And
                    | AluOPRRR::Or
                    | AluOPRRR::Xor
                    | AluOPRRR::Addw
                    | AluOPRRR::Mul),
                rd,
                rs1,
                rs2,
            } if (rd.to_reg() == rs1 || rd.to_reg() == rs2)
                && reg_is_compressible(rs1)
                && reg_is_compressible(rs2) =>
            {
                let op = match alu_op {
                    AluOPRRR::And => CaOp::CAnd,
                    AluOPRRR::Or => CaOp::COr,
                    AluOPRRR::Xor => CaOp::CXor,
                    AluOPRRR::Addw => CaOp::CAddw,
                    AluOPRRR::Mul if has_zcb && has_m => CaOp::CMul,
                    _ => return None,
                };
                // The canonical expansion for these instructions has `rd == rs1`, but
                // these are all commutative operations, so we can swap the operands.
                let src = if rd.to_reg() == rs1 { rs2 } else { rs1 };
                sink.put2(encode_ca_type(op, rd, src));
            }

            // The sub instructions are non-commutative, so we can't swap the operands.
            Inst::AluRRR {
                alu_op: alu_op @ (AluOPRRR::Sub | AluOPRRR::Subw),
                rd,
                rs1,
                rs2,
            } if rd.to_reg() == rs1 && reg_is_compressible(rs1) && reg_is_compressible(rs2) => {
                let op = match alu_op {
                    AluOPRRR::Sub => CaOp::CSub,
                    AluOPRRR::Subw => CaOp::CSubw,
                    _ => return None,
                };
                sink.put2(encode_ca_type(op, rd, rs2));
            }

            // c.j
            //
            // We don't have a separate JAL as that is only available in RV32C
            Inst::Jal { label } => {
                sink.use_label_at_offset(*start_off, label, LabelUse::RVCJump);
                sink.add_uncond_branch(*start_off, *start_off + 2, label);
                sink.put2(encode_cj_type(CjOp::CJ, Imm12::ZERO));
            }

            // c.jr
            Inst::Jalr { rd, base, offset }
                if rd.to_reg() == zero_reg() && base != zero_reg() && offset.as_i16() == 0 =>
            {
                sink.put2(encode_cr2_type(CrOp::CJr, base));
            }

            // c.jalr
            Inst::Jalr { rd, base, offset }
                if rd.to_reg() == link_reg() && base != zero_reg() && offset.as_i16() == 0 =>
            {
                sink.put2(encode_cr2_type(CrOp::CJalr, base));
            }

            // c.ebreak
            Inst::EBreak => {
                sink.put2(encode_cr_type(
                    CrOp::CEbreak,
                    writable_zero_reg(),
                    zero_reg(),
                ));
            }

            // c.unimp
            Inst::Udf { trap_code } => {
                sink.add_trap(trap_code);
                sink.put2(0x0000);
            }

            // c.addi16sp
            //
            // c.addi16sp shares the opcode with c.lui, but has a destination field of x2.
            // c.addi16sp adds the non-zero sign-extended 6-bit immediate to the value in the
            // stack pointer (sp=x2), where the immediate is scaled to represent multiples of 16
            // in the range (-512,496). c.addi16sp is used to adjust the stack pointer in
            // procedure prologues and epilogues. It expands into addi x2, x2, nzimm. c.addi16sp
            // is only valid when nzimm≠0; the code point with nzimm=0 is reserved.
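            // For example, `addi sp, sp, 32` is compressible to c.addi16sp with
            // imm6 = 32 / 16 = 2.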
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs
                && rs == stack_reg()
                && imm12.as_i16() != 0
                && (imm12.as_i16() % 16) == 0
                && Imm6::maybe_from_i16(imm12.as_i16() / 16).is_some() =>
            {
                let imm6 = Imm6::maybe_from_i16(imm12.as_i16() / 16).unwrap();
                sink.put2(encode_c_addi16sp(imm6));
            }

            // c.addi4spn
            //
            // c.addi4spn is a CIW-format instruction that adds a zero-extended non-zero
            // immediate, scaled by 4, to the stack pointer, x2, and writes the result to
            // rd. This instruction is used to generate pointers to stack-allocated variables
            // and expands to addi rd, x2, nzuimm. c.addi4spn is only valid when nzuimm≠0;
            // the code points with nzuimm=0 are reserved.
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if reg_is_compressible(rd.to_reg())
                && rs == stack_reg()
                && imm12.as_i16() != 0
                && (imm12.as_i16() % 4) == 0
                && u8::try_from(imm12.as_i16() / 4).is_ok() =>
            {
                let imm = u8::try_from(imm12.as_i16() / 4).unwrap();
                sink.put2(encode_ciw_type(CiwOp::CAddi4spn, rd, imm));
            }

            // c.li
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() != zero_reg() && rs == zero_reg() => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_ci_type(CiOp::CLi, rd, imm6));
            }

            // c.addi
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && rs != zero_reg() && imm12.as_i16() != 0 => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_ci_type(CiOp::CAddi, rd, imm6));
            }

            // c.addiw
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addiw,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && rs != zero_reg() => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_ci_type(CiOp::CAddiw, rd, imm6));
            }

            // c.lui
            //
            // c.lui loads the non-zero 6-bit immediate field into bits 17–12
            // of the destination register, clears the bottom 12 bits, and
            // sign-extends bit 17 into all higher bits of the destination.
            Inst::Lui { rd, imm: imm20 }
                if rd.to_reg() != zero_reg()
                    && rd.to_reg() != stack_reg()
                    && imm20.as_i32() != 0 =>
            {
                // Check that the top bits are sign extended.
                let imm = imm20.as_i32() << 14 >> 14;
                if imm != imm20.as_i32() {
                    return None;
                }
                let imm6 = Imm6::maybe_from_i32(imm)?;
                sink.put2(encode_ci_type(CiOp::CLui, rd, imm6));
            }

            // c.slli
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Slli,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && rs != zero_reg() && imm12.as_i16() != 0 => {
                // The shift amount is unsigned, but we encode it as signed.
                let shift = imm12.as_i16() & 0x3f;
                let imm6 = Imm6::maybe_from_i16(shift << 10 >> 10).unwrap();
                sink.put2(encode_ci_type(CiOp::CSlli, rd, imm6));
            }

            // c.srli / c.srai
            Inst::AluRRImm12 {
                alu_op: op @ (AluOPRRI::Srli | AluOPRRI::Srai),
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && reg_is_compressible(rs) && imm12.as_i16() != 0 => {
                let op = match op {
                    AluOPRRI::Srli => CbOp::CSrli,
                    AluOPRRI::Srai => CbOp::CSrai,
                    _ => unreachable!(),
                };

                // The shift amount is unsigned, but we encode it as signed.
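                // Masking to 6 bits and sign-extending with `<< 10 >> 10` keeps the
                // raw shift bits unchanged while fitting them into `Imm6`'s signed range.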
                let shift = imm12.as_i16() & 0x3f;
                let imm6 = Imm6::maybe_from_i16(shift << 10 >> 10).unwrap();
                sink.put2(encode_cb_type(op, rd, imm6));
            }

            // c.zextb
            //
            // This is an alias for `andi rd, rd, 0xff`
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Andi,
                rd,
                rs,
                imm12,
            } if has_zcb
                && rd.to_reg() == rs
                && reg_is_compressible(rs)
                && imm12.as_i16() == 0xff =>
            {
                sink.put2(encode_cszn_type(CsznOp::CZextb, rd));
            }

            // c.andi
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Andi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && reg_is_compressible(rs) => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_cb_type(CbOp::CAndi, rd, imm6));
            }

            // Stack Based Loads
            Inst::Load {
                rd,
                op: op @ (LoadOP::Lw | LoadOP::Ld | LoadOP::Fld),
                from,
                flags,
            } if from.get_base_register() == Some(stack_reg())
                && (from.get_offset_with_state(state) % op.size()) == 0 =>
            {
                // We encode the offset in multiples of the load size.
                let offset = from.get_offset_with_state(state);
                let imm6 = u8::try_from(offset / op.size())
                    .ok()
                    .and_then(Uimm6::maybe_from_u8)?;

                // Some additional constraints on these instructions.
                //
                // Integer loads are not allowed to target x0, but floating point loads
                // are, since f0 is not a special register.
                //
                // Floating point loads are not included in the base Zca extension
                // but in a separate Zcd extension. Both of these are part of the C Extension.
                let rd_is_zero = rd.to_reg() == zero_reg();
                let op = match op {
                    LoadOP::Lw if !rd_is_zero => CiOp::CLwsp,
                    LoadOP::Ld if !rd_is_zero => CiOp::CLdsp,
                    LoadOP::Fld if has_zcd => CiOp::CFldsp,
                    _ => return None,
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(trap_code);
                }
                sink.put2(encode_ci_sp_load(op, rd, imm6));
            }

            // Regular Loads
            Inst::Load {
                rd,
                op:
                    op @ (LoadOP::Lw
                    | LoadOP::Ld
                    | LoadOP::Fld
                    | LoadOP::Lbu
                    | LoadOP::Lhu
                    | LoadOP::Lh),
                from,
                flags,
            } if reg_is_compressible(rd.to_reg())
                && from
                    .get_base_register()
                    .map(reg_is_compressible)
                    .unwrap_or(false)
                && (from.get_offset_with_state(state) % op.size()) == 0 =>
            {
                let base = from.get_base_register().unwrap();

                // We encode the offset in multiples of the load size.
                let offset = from.get_offset_with_state(state);
                let offset = u8::try_from(offset / op.size()).ok()?;

                // We mix two different formats here.
                //
                // c.lw / c.ld / c.fld instructions are available in the standard Zca
                // extension using the CL format.
                //
                // c.lbu / c.lhu / c.lh are only available in the Zcb extension and
                // are also encoded differently. Technically they each have a different
                // format, but they are similar enough that we can group them.
                let is_zcb_load = matches!(op, LoadOP::Lbu | LoadOP::Lhu | LoadOP::Lh);
                let encoded = if is_zcb_load {
                    if !has_zcb {
                        return None;
                    }

                    let op = match op {
                        LoadOP::Lbu => ZcbMemOp::CLbu,
                        LoadOP::Lhu => ZcbMemOp::CLhu,
                        LoadOP::Lh => ZcbMemOp::CLh,
                        _ => unreachable!(),
                    };

                    // Byte stores & loads have 2 bits of immediate offset. Halfword stores
                    // and loads only have 1 bit.
                    let imm2 = Uimm2::maybe_from_u8(offset)?;
                    if (offset & !((1 << op.imm_bits()) - 1)) != 0 {
                        return None;
                    }

                    encode_zcbmem_load(op, rd, base, imm2)
                } else {
                    // Floating point loads are not included in the base Zca extension
                    // but in a separate Zcd extension. Both of these are part of the C Extension.
                    let op = match op {
                        LoadOP::Lw => ClOp::CLw,
                        LoadOP::Ld => ClOp::CLd,
                        LoadOP::Fld if has_zcd => ClOp::CFld,
                        _ => return None,
                    };

                    let imm5 = Uimm5::maybe_from_u8(offset)?;
                    encode_cl_type(op, rd, base, imm5)
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(trap_code);
                }
                sink.put2(encoded);
            }

            // Stack Based Stores
            Inst::Store {
                src,
                op: op @ (StoreOP::Sw | StoreOP::Sd | StoreOP::Fsd),
                to,
                flags,
            } if to.get_base_register() == Some(stack_reg())
                && (to.get_offset_with_state(state) % op.size()) == 0 =>
            {
                // We encode the offset in multiples of the store size.
                let offset = to.get_offset_with_state(state);
                let imm6 = u8::try_from(offset / op.size())
                    .ok()
                    .and_then(Uimm6::maybe_from_u8)?;

                // Floating point stores are not included in the base Zca extension
                // but in a separate Zcd extension. Both of these are part of the C Extension.
                let op = match op {
                    StoreOP::Sw => CssOp::CSwsp,
                    StoreOP::Sd => CssOp::CSdsp,
                    StoreOP::Fsd if has_zcd => CssOp::CFsdsp,
                    _ => return None,
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual store instruction starts.
                    sink.add_trap(trap_code);
                }
                sink.put2(encode_css_type(op, src, imm6));
            }

            // Regular Stores
            Inst::Store {
                src,
                op: op @ (StoreOP::Sw | StoreOP::Sd | StoreOP::Fsd | StoreOP::Sh | StoreOP::Sb),
                to,
                flags,
            } if reg_is_compressible(src)
                && to
                    .get_base_register()
                    .map(reg_is_compressible)
                    .unwrap_or(false)
                && (to.get_offset_with_state(state) % op.size()) == 0 =>
            {
                let base = to.get_base_register().unwrap();

                // We encode the offset in multiples of the store size.
                let offset = to.get_offset_with_state(state);
                let offset = u8::try_from(offset / op.size()).ok()?;

                // We mix two different formats here.
                //
                // c.sw / c.sd / c.fsd instructions are available in the standard Zca
                // extension using the CS format.
                //
                // c.sb / c.sh are only available in the Zcb extension and are also
                // encoded differently.
                let is_zcb_store = matches!(op, StoreOP::Sh | StoreOP::Sb);
                let encoded = if is_zcb_store {
                    if !has_zcb {
                        return None;
                    }

                    let op = match op {
                        StoreOP::Sh => ZcbMemOp::CSh,
                        StoreOP::Sb => ZcbMemOp::CSb,
                        _ => unreachable!(),
                    };

                    // Byte stores & loads have 2 bits of immediate offset. Halfword stores
                    // and loads only have 1 bit.
                    let imm2 = Uimm2::maybe_from_u8(offset)?;
                    if (offset & !((1 << op.imm_bits()) - 1)) != 0 {
                        return None;
                    }

                    encode_zcbmem_store(op, src, base, imm2)
                } else {
                    // Floating point stores are not included in the base Zca extension
                    // but in a separate Zcd extension. Both of these are part of the C Extension.
                    let op = match op {
                        StoreOP::Sw => CsOp::CSw,
                        StoreOP::Sd => CsOp::CSd,
                        StoreOP::Fsd if has_zcd => CsOp::CFsd,
                        _ => return None,
                    };

                    let imm5 = Uimm5::maybe_from_u8(offset)?;
                    encode_cs_type(op, src, base, imm5)
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual store instruction starts.
                    sink.add_trap(trap_code);
                }
                sink.put2(encoded);
            }

            // c.not
            //
            // This is an alias for `xori rd, rd, -1`
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Xori,
                rd,
                rs,
                imm12,
            } if has_zcb
                && rd.to_reg() == rs
                && reg_is_compressible(rs)
                && imm12.as_i16() == -1 =>
            {
                sink.put2(encode_cszn_type(CsznOp::CNot, rd));
            }

            // c.sext.b / c.sext.h / c.zext.h
            //
            // These are all the extend instructions present in `Zcb`; they
            // also require `Zbb` since they aren't available in the base ISA.
            Inst::AluRRImm12 {
                alu_op: alu_op @ (AluOPRRI::Sextb | AluOPRRI::Sexth | AluOPRRI::Zexth),
                rd,
                rs,
                imm12,
            } if has_zcb
                && has_zbb
                && rd.to_reg() == rs
                && reg_is_compressible(rs)
                && imm12.as_i16() == 0 =>
            {
                let op = match alu_op {
                    AluOPRRI::Sextb => CsznOp::CSextb,
                    AluOPRRI::Sexth => CsznOp::CSexth,
                    AluOPRRI::Zexth => CsznOp::CZexth,
                    _ => unreachable!(),
                };
                sink.put2(encode_cszn_type(op, rd));
            }

            // c.zext.w
            //
            // This is an alias for `add.uw rd, rd, zero`
            Inst::AluRRR {
                alu_op: AluOPRRR::Adduw,
                rd,
                rs1,
                rs2,
            } if has_zcb
                && has_zba
                && rd.to_reg() == rs1
                && reg_is_compressible(rs1)
                && rs2 == zero_reg() =>
            {
                sink.put2(encode_cszn_type(CsznOp::CZextw, rd));
            }

            _ => return None,
        }

        return Some(());
    }

    fn emit_uncompressed(
        &self,
        sink: &mut MachBuffer<Inst>,
        emit_info: &EmitInfo,
        state: &mut EmitState,
        start_off: &mut u32,
    ) {
        match self {
            &Inst::Nop0 => {
                // Do nothing.
            }
            // Addi x0, x0, 0
            &Inst::Nop4 => {
                let x = Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Addi,
                    rd: Writable::from_reg(zero_reg()),
                    rs: zero_reg(),
                    imm12: Imm12::ZERO,
                };
                x.emit(sink, emit_info, state)
            }
            &Inst::RawData { ref data } => {
                // Right now we only put a u32 or u64 in this instruction.
                // It is not very long, so there is no need to check whether we need to
                // `emit_island`. If the data were very long, that would be a bug, because
                // RawData is typically used to load some data at a known position in the
                // code stream, and we could exceed `Inst::worst_case_size`.
                // For more information see https://github.com/bytecodealliance/wasmtime/pull/5612.
                sink.put_data(&data[..]);
            }
            &Inst::Lui { rd, ref imm } => {
                let x: u32 = 0b0110111 | reg_to_gpr_num(rd.to_reg()) << 7 | (imm.bits() << 12);
                sink.put4(x);
            }
            &Inst::Fli { rd, ty, imm } => {
                sink.put4(encode_fli(ty, imm, rd));
            }
            &Inst::LoadInlineConst { rd, ty, imm } => {
                let data = &imm.to_le_bytes()[..ty.bytes() as usize];

                let label_data: MachLabel = sink.get_label();
                let label_end: MachLabel = sink.get_label();

                // Load into rd.
                Inst::Load {
                    rd,
                    op: LoadOP::from_type(ty),
                    flags: MemFlags::new(),
                    from: AMode::Label(label_data),
                }
                .emit(sink, emit_info, state);

                // Jump over the inline pool.
                Inst::gen_jump(label_end).emit(sink, emit_info, state);

                // Emit the inline data.
                sink.bind_label(label_data, &mut state.ctrl_plane);
                Inst::RawData { data: data.into() }.emit(sink, emit_info, state);

                sink.bind_label(label_end, &mut state.ctrl_plane);
            }
            &Inst::FpuRR {
                alu_op,
                width,
                frm,
                rd,
                rs,
            } => {
                if alu_op.is_convert_to_int() {
                    sink.add_trap(TrapCode::BAD_CONVERSION_TO_INTEGER);
                }
                sink.put4(encode_fp_rr(alu_op, width, frm, rd, rs));
            }
            &Inst::FpuRRRR {
                alu_op,
                rd,
                rs1,
                rs2,
                rs3,
                frm,
                width,
            } => {
                sink.put4(encode_fp_rrrr(alu_op, width, frm, rd, rs1, rs2, rs3));
            }
            &Inst::FpuRRR {
                alu_op,
                width,
                frm,
                rd,
                rs1,
                rs2,
            } => {
                sink.put4(encode_fp_rrr(alu_op, width, frm, rd, rs1, rs2));
            }
            &Inst::Unwind { ref inst } => {
                sink.add_unwind(inst.clone());
            }
            &Inst::DummyUse { .. } => {
                // This has already been handled by Inst::allocate.
            }
            &Inst::AluRRR {
                alu_op,
                rd,
                rs1,
                rs2,
            } => {
                let (rs1, rs2) = if alu_op.reverse_rs() {
                    (rs2, rs1)
                } else {
                    (rs1, rs2)
                };

                sink.put4(encode_r_type(
                    alu_op.op_code(),
                    rd,
                    alu_op.funct3(),
                    rs1,
                    rs2,
                    alu_op.funct7(),
                ));
            }
            &Inst::AluRRImm12 {
                alu_op,
                rd,
                rs,
                imm12,
            } => {
                let x = alu_op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | alu_op.funct3() << 12
                    | reg_to_gpr_num(rs) << 15
                    | alu_op.imm12(imm12) << 20;
                sink.put4(x);
            }
            &Inst::CsrReg { op, rd, rs, csr } => {
                sink.put4(encode_csr_reg(op, rd, rs, csr));
            }
            &Inst::CsrImm { op, rd, csr, imm } => {
                sink.put4(encode_csr_imm(op, rd, csr, imm));
            }
            &Inst::Load {
                rd,
                op,
                from,
                flags,
            } => {
                let base = from.get_base_register();
                let offset = from.get_offset_with_state(state);
                let offset_imm12 = Imm12::maybe_from_i64(offset);
                let label = from.get_label_with_sink(sink);

                let (addr, imm12) = match (base, offset_imm12, label) {
                    // When loading from a Reg+Offset, if the offset fits into an imm12 we can directly encode it.
                    (Some(base), Some(imm12), None) => (base, imm12),

                    // Otherwise, if the offset does not fit into an imm12, we need to materialize
                    // it into a register and load from that.
                    (Some(_), None, None) => {
                        let tmp = writable_spilltmp_reg();
                        Inst::LoadAddr { rd: tmp, mem: from }.emit(sink, emit_info, state);
                        (tmp.to_reg(), Imm12::ZERO)
                    }

                    // If the AMode contains a label we can emit an internal relocation that gets
                    // resolved with the correct address later.
                    (None, Some(imm), Some(label)) => {
                        debug_assert_eq!(imm.as_i16(), 0);

                        // Get the current PC.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20);
                        Inst::Auipc {
                            rd,
                            imm: Imm20::ZERO,
                        }
                        .emit_uncompressed(sink, emit_info, state, start_off);

                        // Emit a relocation for the load. This patches the offset into the instruction.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I);

                        // Imm12 here is meaningless since it's going to get replaced.
                        (rd.to_reg(), Imm12::ZERO)
                    }

                    // These cases are impossible with the current AModes that we have. We either
                    // always have a register, or always have a label. Never both, and never neither.
                    (None, None, None)
                    | (None, Some(_), None)
                    | (Some(_), None, Some(_))
                    | (Some(_), Some(_), Some(_))
                    | (None, None, Some(_)) => {
                        unreachable!("Invalid load address")
                    }
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(trap_code);
                }

                sink.put4(encode_i_type(op.op_code(), rd, op.funct3(), addr, imm12));
            }
            &Inst::Store { op, src, flags, to } => {
                let base = to.get_base_register();
                let offset = to.get_offset_with_state(state);
                let offset_imm12 = Imm12::maybe_from_i64(offset);

                let (addr, imm12) = match (base, offset_imm12) {
                    // If the offset fits into an imm12 we can directly encode it.
                    (Some(base), Some(imm12)) => (base, imm12),
                    // Otherwise load the address into a register and store to that.
                    _ => {
                        let tmp = writable_spilltmp_reg();
                        Inst::LoadAddr { rd: tmp, mem: to }.emit(sink, emit_info, state);
                        (tmp.to_reg(), Imm12::ZERO)
                    }
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual store instruction starts.
                    sink.add_trap(trap_code);
                }

                sink.put4(encode_s_type(op.op_code(), op.funct3(), addr, src, imm12));
            }
            &Inst::Args { .. } | &Inst::Rets { .. } => {
                // Nothing: this is a pseudoinstruction that serves
                // only to constrain registers at a certain point.
            }
            &Inst::Ret {} => {
                // RISC-V does not have a dedicated ret instruction; instead we emit the
                // equivalent `jalr x0, x1, 0`, which jumps to the return address.
                Inst::Jalr {
                    rd: writable_zero_reg(),
                    base: link_reg(),
                    offset: Imm12::ZERO,
                }
                .emit(sink, emit_info, state);
            }
            &Inst::Extend {
                rd,
                rn,
                signed,
                from_bits,
                to_bits: _to_bits,
            } => {
                let mut insts = SmallInstVec::new();
                let shift_bits = (64 - from_bits) as i16;
                let is_u8 = || from_bits == 8 && !signed;
                if is_u8() {
                    // Special case for u8: a single `andi` suffices.
                    insts.push(Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Andi,
                        rd,
                        rs: rn,
                        imm12: Imm12::from_i16(255),
                    });
                } else {
                    insts.push(Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Slli,
                        rd,
                        rs: rn,
                        imm12: Imm12::from_i16(shift_bits),
                    });
                    insts.push(Inst::AluRRImm12 {
                        alu_op: if signed {
                            AluOPRRI::Srai
                        } else {
                            AluOPRRI::Srli
                        },
                        rd,
                        rs: rd.to_reg(),
                        imm12: Imm12::from_i16(shift_bits),
                    });
                }
                insts
                    .into_iter()
                    .for_each(|i| i.emit(sink, emit_info, state));
            }
            &Inst::Call { ref info } => {
                sink.add_call_site();
                sink.add_reloc(Reloc::RiscvCallPlt, &info.dest, 0);

                Inst::construct_auipc_and_jalr(Some(writable_link_reg()), writable_link_reg(), 0)
                    .into_iter()
                    .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));

                if let Some(s) = state.take_stack_map() {
                    let offset = sink.cur_offset();
                    sink.push_user_stack_map(state, offset, s);
                }

                let callee_pop_size = i32::try_from(info.callee_pop_size).unwrap();
                if callee_pop_size > 0 {
                    for inst in Riscv64MachineDeps::gen_sp_reg_adjust(-callee_pop_size) {
                        inst.emit(sink, emit_info, state);
                    }
                }
            }
            &Inst::CallInd { ref info } => {
                Inst::Jalr {
                    rd: writable_link_reg(),
                    base: info.dest,
                    offset: Imm12::ZERO,
                }
                .emit(sink, emit_info, state);

                if let Some(s) = state.take_stack_map() {
                    let offset = sink.cur_offset();
                    sink.push_user_stack_map(state, offset, s);
                }

                sink.add_call_site();

                let callee_pop_size = i32::try_from(info.callee_pop_size).unwrap();
                if callee_pop_size > 0 {
                    for inst in Riscv64MachineDeps::gen_sp_reg_adjust(-callee_pop_size) {
                        inst.emit(sink, emit_info, state);
                    }
                }
            }
            &Inst::ReturnCall { ref info } => {
                emit_return_call_common_sequence(sink, emit_info, state, info);

                sink.add_call_site();
                sink.add_reloc(Reloc::RiscvCallPlt, &info.dest, 0);
                Inst::construct_auipc_and_jalr(None, writable_spilltmp_reg(), 0)
                    .into_iter()
                    .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));
            }
            &Inst::ReturnCallInd { ref info } => {
                emit_return_call_common_sequence(sink, emit_info, state, &info);

                Inst::Jalr {
                    rd: writable_zero_reg(),
                    base: info.dest,
                    offset: Imm12::ZERO,
                }
                .emit(sink, emit_info, state);
            }
            &Inst::Jal { label } => {
                sink.use_label_at_offset(*start_off, label, LabelUse::Jal20);
                sink.add_uncond_branch(*start_off, *start_off + 4, label);
                sink.put4(0b1101111);
            }
            &Inst::CondBr {
                taken,
                not_taken,
                kind,
            } => {
                match taken {
                    CondBrTarget::Label(label) => {
                        let code = kind.emit();
                        let code_inverse = kind.inverse().emit().to_le_bytes();
                        sink.use_label_at_offset(*start_off, label, LabelUse::B12);
                        sink.add_cond_branch(*start_off, *start_off + 4, label, &code_inverse);
                        sink.put4(code);
                    }
                    CondBrTarget::Fallthrough => panic!("Cannot fallthrough in taken target"),
                }

                match not_taken {
                    CondBrTarget::Label(label) => {
                        Inst::gen_jump(label).emit(sink, emit_info, state)
                    }
                    CondBrTarget::Fallthrough => {}
                };
            }
            &Inst::Mov { rd, rm, ty } => {
                debug_assert_eq!(rd.to_reg().class(), rm.class());
                if rd.to_reg() == rm {
                    return;
                }

                match rm.class() {
                    RegClass::Int => Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd,
                        rs: rm,
                        imm12: Imm12::ZERO,
                    },
                    RegClass::Float => Inst::FpuRRR {
                        alu_op: FpuOPRRR::Fsgnj,
                        width: FpuOPWidth::try_from(ty).unwrap(),
                        frm: FRM::RNE,
                        rd,
                        rs1: rm,
                        rs2: rm,
                    },
                    RegClass::Vector => Inst::VecAluRRImm5 {
                        op: VecAluOpRRImm5::VmvrV,
                        vd: rd,
                        vs2: rm,
                        // Imm 0 means copy 1 register.
                        imm: Imm5::maybe_from_i8(0).unwrap(),
                        mask: VecOpMasking::Disabled,
                        // Vstate for this instruction is ignored.
                        vstate: VState::from_type(ty),
                    },
                }
                .emit(sink, emit_info, state);
            }
            &Inst::MovFromPReg { rd, rm } => {
                Inst::gen_move(rd, Reg::from(rm), I64).emit(sink, emit_info, state);
            }
            &Inst::BrTable {
                index,
                tmp1,
                tmp2,
                ref targets,
            } => {
                let ext_index = writable_spilltmp_reg();

                let label_compute_target = sink.get_label();

                // The default target is passed in as the 0th element of `targets`;
                // separate it here for clarity.
                let default_target = targets[0];
                let targets = &targets[1..];

                // We are going to potentially emit a large amount of instructions, so
                // ensure that we emit an island now if we need one.
                //
                // The worst case PC calculation is 12 instructions, and each entry in
                // the jump table is 2 instructions.
                let inst_count = 12 + (targets.len() * 2);
                let distance = (inst_count * Inst::UNCOMPRESSED_INSTRUCTION_SIZE as usize) as u32;
                if sink.island_needed(distance) {
                    let jump_around_label = sink.get_label();
                    Inst::gen_jump(jump_around_label).emit(sink, emit_info, state);
                    sink.emit_island(distance + 4, &mut state.ctrl_plane);
                    sink.bind_label(jump_around_label, &mut state.ctrl_plane);
                }

                // We emit a bounds check on the index; if the index is larger than the number of
                // jump table entries, we jump to the default block. Otherwise we compute a jump
                // offset by multiplying the index by 8 (the size of each entry) and then jump to
                // that offset. Each jump table entry is a regular auipc+jalr, which we emit
                // sequentially.
                //
                // Build the following sequence:
                //
                // extend_index:
                //     zext.w  ext_index, index
                // bounds_check:
                //     li      tmp, n_labels
                //     bltu    ext_index, tmp, compute_target
                // jump_to_default_block:
                //     auipc   pc, 0
                //     jalr    zero, pc, default_block
                // compute_target:
                //     auipc   pc, 0
                //     slli    tmp, ext_index, 3
                //     add     pc, pc, tmp
                //     jalr    zero, pc, 0x10
                // jump_table:
                //     ;; This repeats for each entry in the jumptable
                //     auipc   pc, 0
                //     jalr    zero, pc, block_target

                // Extend the index to 64 bits.
                //
                // This prevents us branching on the top 32 bits of the index, which
                // are undefined.
                Inst::Extend {
                    rd: ext_index,
                    rn: index,
                    signed: false,
                    from_bits: 32,
                    to_bits: 64,
                }
                .emit(sink, emit_info, state);

                // Bounds check.
                //
                // Check if the index passed in is larger than the number of jumptable
                // entries that we have. If it is, we fall through to a jump into the
                // default block.
                Inst::load_constant_u32(tmp2, targets.len() as u64)
                    .iter()
                    .for_each(|i| i.emit(sink, emit_info, state));
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_compute_target),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::UnsignedLessThan,
                        rs1: ext_index.to_reg(),
                        rs2: tmp2.to_reg(),
                    },
                }
                .emit(sink, emit_info, state);

                sink.use_label_at_offset(sink.cur_offset(), default_target, LabelUse::PCRel32);
                Inst::construct_auipc_and_jalr(None, tmp2, 0)
                    .iter()
                    .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));

                // Compute the jump table offset.
                // We need to emit a PC-relative offset.
                sink.bind_label(label_compute_target, &mut state.ctrl_plane);

                // Get the current PC.
                Inst::Auipc {
                    rd: tmp1,
                    imm: Imm20::ZERO,
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // These instructions must be emitted as uncompressed since we
                // are manually computing the offset from the PC.
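                // From here the layout is: auipc, slli, add, jalr (16 bytes), then
                // the table itself, so entry `i` lives at the auipc PC + 16 + i * 8.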
                // Multiply the index by 8, since that is the size in
                // bytes of each jump table entry.
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp2,
                    rs: ext_index.to_reg(),
                    imm12: Imm12::from_i16(3),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // Calculate the base of the jump, PC + the offset from above.
                Inst::AluRRR {
                    alu_op: AluOPRRR::Add,
                    rd: tmp1,
                    rs1: tmp1.to_reg(),
                    rs2: tmp2.to_reg(),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // Jump into the jump table at the entry for `ext_index`. The 16 byte
                // offset accounts for the four instructions (auipc, slli, add, jalr)
                // between the captured PC and the start of the table.
                Inst::Jalr {
                    rd: writable_zero_reg(),
                    base: tmp1.to_reg(),
                    offset: Imm12::from_i16((4 * Inst::UNCOMPRESSED_INSTRUCTION_SIZE) as i16),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // Emit the jump table.
                //
                // Each entry is an auipc + jalr to the target block, emitted back to
                // back; any island needed was already emitted above.
                for target in targets.iter() {
                    sink.use_label_at_offset(sink.cur_offset(), *target, LabelUse::PCRel32);
                    Inst::construct_auipc_and_jalr(None, tmp2, 0)
                        .iter()
                        .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));
                }

                // We've just emitted an island that is safe up to *here*.
                // Mark it as such so that we don't needlessly emit additional islands.
                *start_off = sink.cur_offset();
            }
            &Inst::Atomic {
                op,
                rd,
                addr,
                src,
                amo,
            } => {
                // TODO: get flags from original CLIF atomic instruction.
                let flags = MemFlags::new();
                if let Some(trap_code) = flags.trap_code() {
                    sink.add_trap(trap_code);
                }
                let x = op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | op.funct3() << 12
                    | reg_to_gpr_num(addr) << 15
                    | reg_to_gpr_num(src) << 20
                    | op.funct7(amo) << 25;
                sink.put4(x);
            }
            &Inst::Fence { pred, succ } => {
                let x = 0b0001111
                    | 0b00000 << 7
                    | 0b000 << 12
                    | 0b00000 << 15
                    | (succ as u32) << 20
                    | (pred as u32) << 24;
                sink.put4(x);
            }
            &Inst::Auipc { rd, imm } => {
                sink.put4(enc_auipc(rd, imm));
            }
            &Inst::LoadAddr { rd, mem } => {
                let base = mem.get_base_register();
                let offset = mem.get_offset_with_state(state);
                let offset_imm12 = Imm12::maybe_from_i64(offset);

                match (mem, base, offset_imm12) {
                    (_, Some(rs), Some(imm12)) => {
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Addi,
                            rd,
                            rs,
                            imm12,
                        }
                        .emit(sink, emit_info, state);
                    }
                    (_, Some(rs), None) => {
                        let mut insts = Inst::load_constant_u64(rd, offset as u64);
                        insts.push(Inst::AluRRR {
                            alu_op: AluOPRRR::Add,
                            rd,
                            rs1: rd.to_reg(),
                            rs2: rs,
                        });
                        insts
                            .into_iter()
                            .for_each(|inst| inst.emit(sink, emit_info, state));
                    }
                    (AMode::Const(addr), None, _) => {
                        // Get an address label for the constant and recurse.
                        let label = sink.get_label_for_constant(addr);
                        Inst::LoadAddr {
                            rd,
                            mem: AMode::Label(label),
                        }
                        .emit(sink, emit_info, state);
                    }
                    (AMode::Label(label), None, _) => {
                        // Get the current PC.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20);
                        let inst = Inst::Auipc {
                            rd,
                            imm: Imm20::ZERO,
                        };
                        inst.emit_uncompressed(sink, emit_info, state, start_off);

                        // Emit an add to the address with a relocation.
                        // This later gets patched up with the correct offset.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I);
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Addi,
                            rd,
                            rs: rd.to_reg(),
                            imm12: Imm12::ZERO,
                        }
                        .emit_uncompressed(sink, emit_info, state, start_off);
                    }
                    (amode, _, _) => {
                        unimplemented!("LoadAddr: {:?}", amode);
                    }
                }
            }
            &Inst::Select {
                ref dst,
                condition,
                ref x,
                ref y,
            } => {
                // The general form for this select is the following:
                //
                //     mv rd, x
                //     b{cond} rcond, label_end
                //     mv rd, y
                // label_end:
                //     ... etc
                //
                // This is built on the assumption that moves are cheap, but branches and jumps
                // are not. So with this format we always avoid one jump instruction at the expense
                // of an unconditional move.
                //
                // We also perform another optimization here. If the destination register is the same
                // as one of the input registers, we can avoid emitting the first unconditional move
                // and emit just the branch and the second move.
                //
                // To make sure that this happens as often as possible, we also try to invert the
                // condition, so that if either of the input registers are the same as the destination
                // we avoid that move.
                let label_end = sink.get_label();

                let xregs = x.regs();
                let yregs = y.regs();
                let dstregs: Vec<Reg> = dst.regs().into_iter().map(|r| r.to_reg()).collect();
                let condregs = condition.regs();

                // We are going to write to the destination register before evaluating
                // the condition, so we need to make sure that the destination register
                // is not one of the condition registers.
                //
                // This should never happen, since hopefully the regalloc constraints
                // for this register are set up correctly.
                debug_assert_ne!(dstregs, condregs);

                // Check if we can invert the condition and avoid moving the y registers into
                // the destination. This allows us to only emit the branch and one of the moves.
                let (uncond_move, cond_move, condition) = if yregs == dstregs {
                    (yregs, xregs, condition.inverse())
                } else {
                    (xregs, yregs, condition)
                };

                // Unconditionally move one of the values to the destination register.
                //
                // These moves may not end up being emitted if the source and
                // destination registers are the same. That logic is built into
                // the emit function for `Inst::Mov`.
                for i in gen_moves(dst.regs(), uncond_move) {
                    i.emit(sink, emit_info, state);
                }

                // If the condition passes we skip over the conditional move.
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_end),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: condition,
                }
                .emit(sink, emit_info, state);

                // Move the conditional value to the destination register.
                for i in gen_moves(dst.regs(), cond_move) {
                    i.emit(sink, emit_info, state);
                }

                sink.bind_label(label_end, &mut state.ctrl_plane);
            }
            &Inst::Jalr { rd, base, offset } => {
                sink.put4(enc_jalr(rd, base, offset));
            }
            &Inst::EBreak => {
                sink.put4(0x00100073);
            }
            &Inst::AtomicCas {
                offset,
                t0,
                dst,
                e,
                addr,
                v,
                ty,
            } => {
                //     # addr holds address of memory location
                //     # e holds expected value
                //     # v holds desired value
                //     # dst holds return value
                // cas:
                //     lr.w dst, (addr)       # Load original value.
                //     bne dst, e, fail       # Doesn't match, so fail.
                //     sc.w t0, v, (addr)     # Try to update.
                //     bnez t0, cas           # Retry if the store failed.
                // fail:
                let fail_label = sink.get_label();
                let cas_label = sink.get_label();
                sink.bind_label(cas_label, &mut state.ctrl_plane);
                Inst::Atomic {
                    op: AtomicOP::load_op(ty),
                    rd: dst,
                    addr,
                    src: zero_reg(),
                    amo: AMO::SeqCst,
                }
                .emit(sink, emit_info, state);
                if ty.bits() < 32 {
                    AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                        .iter()
                        .for_each(|i| i.emit(sink, emit_info, state));
                } else if ty.bits() == 32 {
                    Inst::Extend {
                        rd: dst,
                        rn: dst.to_reg(),
                        signed: false,
                        from_bits: 32,
                        to_bits: 64,
                    }
                    .emit(sink, emit_info, state);
                }
                Inst::CondBr {
                    taken: CondBrTarget::Label(fail_label),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: e,
                        rs2: dst.to_reg(),
                    },
                }
                .emit(sink, emit_info, state);
                let store_value = if ty.bits() < 32 {
                    // Reload the value into t0.
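                    // For sub-word types the store must write back a full aligned
                    // word, so we splice the desired value into the freshly loaded
                    // word before attempting the store.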
                    Inst::Atomic {
                        op: AtomicOP::load_op(ty),
                        rd: t0,
                        addr,
                        src: zero_reg(),
                        amo: AMO::SeqCst,
                    }
                    .emit(sink, emit_info, state);
                    // Merge the desired value into the loaded word.
                    AtomicOP::merge(t0, writable_spilltmp_reg(), offset, v, ty)
                        .iter()
                        .for_each(|i| i.emit(sink, emit_info, state));
                    t0.to_reg()
                } else {
                    v
                };
                Inst::Atomic {
                    op: AtomicOP::store_op(ty),
                    rd: t0,
                    addr,
                    src: store_value,
                    amo: AMO::SeqCst,
                }
                .emit(sink, emit_info, state);

                // Check whether our value was stored.
                Inst::CondBr {
                    taken: CondBrTarget::Label(cas_label),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: t0.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(sink, emit_info, state);
                sink.bind_label(fail_label, &mut state.ctrl_plane);
            }
            &Inst::AtomicRmwLoop {
                offset,
                op,
                dst,
                ty,
                p,
                x,
                t0,
            } => {
                let retry = sink.get_label();
                sink.bind_label(retry, &mut state.ctrl_plane);
                // Load the old value.
                Inst::Atomic {
                    op: AtomicOP::load_op(ty),
                    rd: dst,
                    addr: p,
                    src: zero_reg(),
                    amo: AMO::SeqCst,
                }
                .emit(sink, emit_info, state);
                // Compute the value to store back.
                let store_value: Reg = match op {
                    crate::ir::AtomicRmwOp::Add
                    | crate::ir::AtomicRmwOp::Sub
                    | crate::ir::AtomicRmwOp::And
                    | crate::ir::AtomicRmwOp::Or
                    | crate::ir::AtomicRmwOp::Xor => {
                        AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                            .iter()
                            .for_each(|i| i.emit(sink, emit_info, state));
                        Inst::AluRRR {
                            alu_op: match op {
                                crate::ir::AtomicRmwOp::Add => AluOPRRR::Add,
                                crate::ir::AtomicRmwOp::Sub => AluOPRRR::Sub,
                                crate::ir::AtomicRmwOp::And => AluOPRRR::And,
                                crate::ir::AtomicRmwOp::Or => AluOPRRR::Or,
                                crate::ir::AtomicRmwOp::Xor => AluOPRRR::Xor,
                                _ => unreachable!(),
                            },
                            rd: t0,
                            rs1: dst.to_reg(),
                            rs2: x,
                        }
                        .emit(sink, emit_info, state);
                        Inst::Atomic {
                            op: AtomicOP::load_op(ty),
                            rd: writable_spilltmp_reg2(),
                            addr: p,
                            src: zero_reg(),
                            amo: AMO::SeqCst,
                        }
                        .emit(sink, emit_info, state);
                        AtomicOP::merge(
                            writable_spilltmp_reg2(),
                            writable_spilltmp_reg(),
                            offset,
                            t0.to_reg(),
                            ty,
                        )
                        .iter()
                        .for_each(|i| i.emit(sink, emit_info, state));
                        spilltmp_reg2()
                    }
                    crate::ir::AtomicRmwOp::Nand => {
                        if ty.bits() < 32 {
                            AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                                .iter()
                                .for_each(|i| i.emit(sink, emit_info, state));
                        }
                        Inst::AluRRR {
                            alu_op: AluOPRRR::And,
                            rd: t0,
                            rs1: x,
                            rs2: dst.to_reg(),
                        }
                        .emit(sink, emit_info, state);
                        Inst::construct_bit_not(t0, t0.to_reg()).emit(sink, emit_info, state);
                        if ty.bits() < 32 {
                            Inst::Atomic {
                                op: AtomicOP::load_op(ty),
                                rd: writable_spilltmp_reg2(),
                                addr: p,
                                src: zero_reg(),
                                amo: AMO::SeqCst,
                            }
                            .emit(sink, emit_info, state);
                            AtomicOP::merge(
                                writable_spilltmp_reg2(),
                                writable_spilltmp_reg(),
                                offset,
                                t0.to_reg(),
                                ty,
                            )
                            .iter()
                            .for_each(|i| i.emit(sink, emit_info, state));
                            spilltmp_reg2()
                        } else {
                            t0.to_reg()
                        }
                    }
                    crate::ir::AtomicRmwOp::Umin
                    | crate::ir::AtomicRmwOp::Umax
                    | crate::ir::AtomicRmwOp::Smin
                    | crate::ir::AtomicRmwOp::Smax => {
                        let label_select_dst = sink.get_label();
                        let label_select_done = sink.get_label();
                        if op == crate::ir::AtomicRmwOp::Umin || op == crate::ir::AtomicRmwOp::Umax
                        {
                            AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                        } else {
                            AtomicOP::extract_sext(dst, offset, dst.to_reg(), ty)
                        }
                        .iter()
                        .for_each(|i| i.emit(sink, emit_info, state));
                        Inst::CondBr {
                            taken: CondBrTarget::Label(label_select_dst),
                            not_taken: CondBrTarget::Fallthrough,
                            kind: IntegerCompare {
                                kind: match op {
                                    crate::ir::AtomicRmwOp::Umin => IntCC::UnsignedLessThan,
                                    crate::ir::AtomicRmwOp::Umax => IntCC::UnsignedGreaterThan,
                                    crate::ir::AtomicRmwOp::Smin => IntCC::SignedLessThan,
                                    crate::ir::AtomicRmwOp::Smax => IntCC::SignedGreaterThan,
                                    _ => unreachable!(),
                                },
                                rs1: dst.to_reg(),
                                rs2: x,
                            },
                        }
                        .emit(sink, emit_info, state);
                        // Here we select x.
                        Inst::gen_move(t0, x, I64).emit(sink, emit_info, state);
                        Inst::gen_jump(label_select_done).emit(sink, emit_info, state);
                        sink.bind_label(label_select_dst, &mut state.ctrl_plane);
                        Inst::gen_move(t0, dst.to_reg(), I64).emit(sink, emit_info, state);
                        sink.bind_label(label_select_done, &mut state.ctrl_plane);
                        Inst::Atomic {
                            op: AtomicOP::load_op(ty),
                            rd: writable_spilltmp_reg2(),
                            addr: p,
                            src: zero_reg(),
                            amo: AMO::SeqCst,
                        }
                        .emit(sink, emit_info, state);
                        AtomicOP::merge(
                            writable_spilltmp_reg2(),
                            writable_spilltmp_reg(),
                            offset,
                            t0.to_reg(),
                            ty,
                        )
                        .iter()
                        .for_each(|i| i.emit(sink, emit_info, state));
                        spilltmp_reg2()
                    }
                    crate::ir::AtomicRmwOp::Xchg => {
                        AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                            .iter()
                            .for_each(|i| i.emit(sink, emit_info, state));
                        Inst::Atomic {
                            op: AtomicOP::load_op(ty),
                            rd: writable_spilltmp_reg2(),
                            addr: p,
                            src: zero_reg(),
                            amo: AMO::SeqCst,
                        }
                        .emit(sink, emit_info, state);
                        AtomicOP::merge(
                            writable_spilltmp_reg2(),
                            writable_spilltmp_reg(),
                            offset,
                            x,
                            ty,
                        )
                        .iter()
                        .for_each(|i| i.emit(sink, emit_info, state));
                        spilltmp_reg2()
                    }
                };
                Inst::Atomic {
                    op: AtomicOP::store_op(ty),
                    rd: t0,
                    addr: p,
                    src: store_value,
                    amo: AMO::SeqCst,
                }
                .emit(sink, emit_info, state);
                // If the store failed, retry.
                Inst::CondBr {
                    taken: CondBrTarget::Label(retry),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: t0.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(sink, emit_info, state);
            }
            &Inst::LoadExtName {
                rd,
                ref name,
                offset,
            } => {
                if emit_info.shared_flag.is_pic() {
                    // Load a PC-relative address into a register.
                    // RISC-V does this slightly differently from other arches. We emit a
                    // relocation with a label, instead of the symbol itself.
                    //
                    // See: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#pc-relative-symbol-addresses
                    //
                    // Emit the following code:
                    // label:
                    //    auipc rd, 0              # R_RISCV_GOT_HI20 (symbol_name)
                    //    ld    rd, rd, 0          # R_RISCV_PCREL_LO12_I (label)

                    // Create the label that is going to be published to the final binary object.
                    let auipc_label = sink.get_label();
                    sink.bind_label(auipc_label, &mut state.ctrl_plane);

                    // Get the current PC.
                    sink.add_reloc(Reloc::RiscvGotHi20, &**name, 0);
                    Inst::Auipc {
                        rd,
                        imm: Imm20::from_i32(0),
                    }
                    .emit_uncompressed(sink, emit_info, state, start_off);

                    // The `ld` here points to the `auipc` label instead of directly to the symbol.
                    sink.add_reloc(Reloc::RiscvPCRelLo12I, &auipc_label, 0);
                    Inst::Load {
                        rd,
                        op: LoadOP::Ld,
                        flags: MemFlags::trusted(),
                        from: AMode::RegOffset(rd.to_reg(), 0),
                    }
                    .emit_uncompressed(sink, emit_info, state, start_off);
                } else {
                    // In the non-PIC sequence we relocate the absolute address into
                    // a preallocated space, load it into a register, and jump over it.
                    //
                    // Emit the following code:
                    //   ld rd, label_data
                    //   j label_end
                    // label_data:
                    //   <8 byte space>           # ABS8
                    // label_end:

                    let label_data = sink.get_label();
                    let label_end = sink.get_label();

                    // Load the value from a label.
                    Inst::Load {
                        rd,
                        op: LoadOP::Ld,
                        flags: MemFlags::trusted(),
                        from: AMode::Label(label_data),
                    }
                    .emit(sink, emit_info, state);

                    // Jump over the data.
                    Inst::gen_jump(label_end).emit(sink, emit_info, state);

                    sink.bind_label(label_data, &mut state.ctrl_plane);
                    sink.add_reloc(Reloc::Abs8, name.as_ref(), offset);
                    sink.put8(0);

                    sink.bind_label(label_end, &mut state.ctrl_plane);
                }
            }
            &Inst::ElfTlsGetAddr { rd, ref name } => {
                // RISC-V's TLS GD model is slightly different from other arches.
                //
                // We have a relocation (R_RISCV_TLS_GD_HI20) that loads the high 20 bits
                // of the address relative to the GOT entry. This relocation points to
                // the symbol as usual.
                //
                // However when loading the bottom 12 bits of the address, we need to
                // use a label that points to the previous AUIPC instruction.
                //
                // label:
                //    auipc a0,0                  # R_RISCV_TLS_GD_HI20 (symbol)
                //    addi  a0,a0,0               # R_RISCV_PCREL_LO12_I (label)
                //
                // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#global-dynamic

                // Create the label that is going to be published to the final binary object.
                let auipc_label = sink.get_label();
                sink.bind_label(auipc_label, &mut state.ctrl_plane);

                // Get the current PC.
                sink.add_reloc(Reloc::RiscvTlsGdHi20, &**name, 0);
                Inst::Auipc {
                    rd,
                    imm: Imm20::from_i32(0),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // The `addi` here points to the `auipc` label instead of directly to the symbol.
                sink.add_reloc(Reloc::RiscvPCRelLo12I, &auipc_label, 0);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Addi,
                    rd,
                    rs: rd.to_reg(),
                    imm12: Imm12::from_i16(0),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                Inst::Call {
                    info: Box::new(CallInfo::empty(
                        ExternalName::LibCall(LibCall::ElfTlsGetAddr),
                        CallConv::SystemV,
                    )),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);
            }
            &Inst::TrapIf {
                rs1,
                rs2,
                cc,
                trap_code,
            } => {
                let label_end = sink.get_label();
                let cond = IntegerCompare { kind: cc, rs1, rs2 };

                // Jump over the trap if the condition is false.
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_end),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: cond.inverse(),
                }
                .emit(sink, emit_info, state);
                Inst::Udf { trap_code }.emit(sink, emit_info, state);

                sink.bind_label(label_end, &mut state.ctrl_plane);
            }
            &Inst::Udf { trap_code } => {
                sink.add_trap(trap_code);
                sink.put_data(Inst::TRAP_OPCODE);
            }
            &Inst::AtomicLoad { rd, ty, p } => {
                // Emit the fence.
                Inst::Fence {
                    pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                    succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                }
                .emit(sink, emit_info, state);
                // Load.
                Inst::Load {
                    rd,
                    op: LoadOP::from_type(ty),
                    flags: MemFlags::new(),
                    from: AMode::RegOffset(p, 0),
                }
                .emit(sink, emit_info, state);
                Inst::Fence {
                    pred: Inst::FENCE_REQ_R,
                    succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                }
                .emit(sink, emit_info, state);
            }
            &Inst::AtomicStore { src, ty, p } => {
                Inst::Fence {
                    pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                    succ: Inst::FENCE_REQ_W,
                }
                .emit(sink, emit_info, state);
                Inst::Store {
                    to: AMode::RegOffset(p, 0),
                    op: StoreOP::from_type(ty),
                    flags: MemFlags::new(),
                    src,
                }
                .emit(sink, emit_info, state);
            }
            &Inst::Popcnt {
                sum,
                tmp,
                step,
                rs,
                ty,
            } => {
                // Initialize sum to zero.
                Inst::gen_move(sum, zero_reg(), I64).emit(sink, emit_info, state);
                // Load the number of bits to test into step.
                Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16))
                    .emit(sink, emit_info, state);
                // Set tmp to the first probe mask: 1 << (ty.bits() - 1).
                Inst::load_imm12(tmp, Imm12::ONE).emit(sink, emit_info, state);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp,
                    rs: tmp.to_reg(),
                    imm12: Imm12::from_i16((ty.bits() - 1) as i16),
                }
                .emit(sink, emit_info, state);
                let label_done = sink.get_label();
                let label_loop = sink.get_label();
                sink.bind_label(label_loop, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::SignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(sink, emit_info, state);
                // Test the current bit and increment sum if it is set.
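                // Loop invariant: `tmp` holds the single-bit probe mask and `step`
                // counts the bits still to be visited.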
                {
                    Inst::AluRRR {
                        alu_op: AluOPRRR::And,
                        rd: writable_spilltmp_reg2(),
                        rs1: tmp.to_reg(),
                        rs2: rs,
                    }
                    .emit(sink, emit_info, state);
                    let label_over = sink.get_label();
                    Inst::CondBr {
                        taken: CondBrTarget::Label(label_over),
                        not_taken: CondBrTarget::Fallthrough,
                        kind: IntegerCompare {
                            kind: IntCC::Equal,
                            rs1: zero_reg(),
                            rs2: spilltmp_reg2(),
                        },
                    }
                    .emit(sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: sum,
                        rs: sum.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(sink, emit_info, state);
                    sink.bind_label(label_over, &mut state.ctrl_plane);
                }
                // Advance: decrement step and shift tmp to the next lower bit.
                {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: step,
                        rs: step.to_reg(),
                        imm12: Imm12::from_i16(-1),
                    }
                    .emit(sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Srli,
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(sink, emit_info, state);
                    Inst::gen_jump(label_loop).emit(sink, emit_info, state);
                }
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::Cltz {
                sum,
                tmp,
                step,
                rs,
                leading,
                ty,
            } => {
                // Initialize sum to zero.
                Inst::gen_move(sum, zero_reg(), I64).emit(sink, emit_info, state);
                // Load the number of bits to test into step.
                Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16))
                    .emit(sink, emit_info, state);
                // Set tmp to the first probe mask: bit ty.bits() - 1 when counting
                // leading zeros, bit 0 when counting trailing zeros.
                Inst::load_imm12(tmp, Imm12::ONE).emit(sink, emit_info, state);
                if leading {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Slli,
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::from_i16((ty.bits() - 1) as i16),
                    }
                    .emit(sink, emit_info, state);
                }
                let label_done = sink.get_label();
                let label_loop = sink.get_label();
                sink.bind_label(label_loop, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::SignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(sink, emit_info, state);
                // Test the current bit: exit at the first set bit, otherwise count it.
                {
                    Inst::AluRRR {
                        alu_op: AluOPRRR::And,
                        rd: writable_spilltmp_reg2(),
                        rs1: tmp.to_reg(),
                        rs2: rs,
                    }
                    .emit(sink, emit_info, state);
                    Inst::CondBr {
                        taken: CondBrTarget::Label(label_done),
                        not_taken: CondBrTarget::Fallthrough,
                        kind: IntegerCompare {
                            kind: IntCC::NotEqual,
                            rs1: zero_reg(),
                            rs2: spilltmp_reg2(),
                        },
                    }
                    .emit(sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: sum,
                        rs: sum.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(sink, emit_info, state);
                }
                // Advance: decrement step and shift tmp toward the other end.
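                // Note: the early exit above means `sum` ends up counting only the
                // zeros seen before the first set bit.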
                {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: step,
                        rs: step.to_reg(),
                        imm12: Imm12::from_i16(-1),
                    }
                    .emit(sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: if leading {
                            AluOPRRI::Srli
                        } else {
                            AluOPRRI::Slli
                        },
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(sink, emit_info, state);
                    Inst::gen_jump(label_loop).emit(sink, emit_info, state);
                }
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::Brev8 {
                rs,
                ty,
                step,
                tmp,
                tmp2,
                rd,
            } => {
                Inst::gen_move(rd, zero_reg(), I64).emit(sink, emit_info, state);
                // Load the number of bits to process into step.
                Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16))
                    .emit(sink, emit_info, state);
                // tmp is the source bit mask, starting at the most significant bit.
                Inst::load_imm12(tmp, Imm12::ONE).emit(sink, emit_info, state);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp,
                    rs: tmp.to_reg(),
                    imm12: Imm12::from_i16((ty.bits() - 1) as i16),
                }
                .emit(sink, emit_info, state);
                // tmp2 is the destination bit mask, starting at the least significant
                // bit of the most significant byte.
                Inst::load_imm12(tmp2, Imm12::ONE).emit(sink, emit_info, state);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp2,
                    rs: tmp2.to_reg(),
                    imm12: Imm12::from_i16((ty.bits() - 8) as i16),
                }
                .emit(sink, emit_info, state);
                let label_done = sink.get_label();
                let label_loop = sink.get_label();
                sink.bind_label(label_loop, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::SignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(sink, emit_info, state);
                // Test the source bit and, if set, set the destination bit.
                {
                    Inst::AluRRR {
                        alu_op: AluOPRRR::And,
                        rd: writable_spilltmp_reg2(),
                        rs1: tmp.to_reg(),
                        rs2: rs,
                    }
                    .emit(sink, emit_info, state);
                    let label_over = sink.get_label();
                    Inst::CondBr {
                        taken: CondBrTarget::Label(label_over),
                        not_taken: CondBrTarget::Fallthrough,
                        kind: IntegerCompare {
                            kind: IntCC::Equal,
                            rs1: zero_reg(),
                            rs2: spilltmp_reg2(),
                        },
                    }
                    .emit(sink, emit_info, state);
                    Inst::AluRRR {
                        alu_op: AluOPRRR::Or,
                        rd,
                        rs1: rd.to_reg(),
                        rs2: tmp2.to_reg(),
                    }
                    .emit(sink, emit_info, state);
                    sink.bind_label(label_over, &mut state.ctrl_plane);
                }
                // Advance: decrement step and shift tmp to the next source bit.
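                // `tmp2` climbs within each byte and, at a byte boundary
                // (step % 8 == 0), drops 15 bits: a net move of 7 bits down, onto
                // the LSB of the next lower byte.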
                {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: step,
                        rs: step.to_reg(),
                        imm12: Imm12::from_i16(-1),
                    }
                    .emit(sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Srli,
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(sink, emit_info, state);
                    {
                        // Reset tmp2:
                        //   if (step % 8 == 0) then tmp2 = tmp2 >> 15
                        //   else                    tmp2 = tmp2 << 1
                        let label_over = sink.get_label();
                        let label_sll_1 = sink.get_label();
                        Inst::load_imm12(writable_spilltmp_reg2(), Imm12::from_i16(8))
                            .emit(sink, emit_info, state);
                        Inst::AluRRR {
                            alu_op: AluOPRRR::Rem,
                            rd: writable_spilltmp_reg2(),
                            rs1: step.to_reg(),
                            rs2: spilltmp_reg2(),
                        }
                        .emit(sink, emit_info, state);
                        Inst::CondBr {
                            taken: CondBrTarget::Label(label_sll_1),
                            not_taken: CondBrTarget::Fallthrough,
                            kind: IntegerCompare {
                                kind: IntCC::NotEqual,
                                rs1: spilltmp_reg2(),
                                rs2: zero_reg(),
                            },
                        }
                        .emit(sink, emit_info, state);
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Srli,
                            rd: tmp2,
                            rs: tmp2.to_reg(),
                            imm12: Imm12::from_i16(15),
                        }
                        .emit(sink, emit_info, state);
                        Inst::gen_jump(label_over).emit(sink, emit_info, state);
                        sink.bind_label(label_sll_1, &mut state.ctrl_plane);
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Slli,
                            rd: tmp2,
                            rs: tmp2.to_reg(),
                            imm12: Imm12::ONE,
                        }
                        .emit(sink, emit_info, state);
                        sink.bind_label(label_over, &mut state.ctrl_plane);
                    }
                    Inst::gen_jump(label_loop).emit(sink, emit_info, state);
                }
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::StackProbeLoop {
                guard_size,
                probe_count,
                tmp: guard_size_tmp,
            } => {
                let step = writable_spilltmp_reg();
                Inst::load_constant_u64(step, (guard_size as u64) * (probe_count as u64))
                    .iter()
                    .for_each(|i| i.emit(sink, emit_info, state));
                Inst::load_constant_u64(guard_size_tmp, guard_size as u64)
                    .iter()
                    .for_each(|i| i.emit(sink, emit_info, state));

                let loop_start = sink.get_label();
                let label_done = sink.get_label();
                sink.bind_label(loop_start, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::UnsignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: guard_size_tmp.to_reg(),
                    },
                }
                .emit(sink, emit_info, state);
                // Compute the probe address.
                Inst::AluRRR {
                    alu_op: AluOPRRR::Sub,
                    rd: writable_spilltmp_reg2(),
                    rs1: stack_reg(),
                    rs2: step.to_reg(),
                }
                .emit(sink, emit_info, state);
                Inst::Store {
                    to: AMode::RegOffset(spilltmp_reg2(), 0),
                    op: StoreOP::Sb,
                    flags: MemFlags::new(),
                    src: zero_reg(),
                }
                .emit(sink, emit_info, state);
                // Move step down by one guard size.
                Inst::AluRRR {
                    alu_op: AluOPRRR::Sub,
                    rd: step,
                    rs1: step.to_reg(),
                    rs2: guard_size_tmp.to_reg(),
                }
                .emit(sink, emit_info, state);
                Inst::gen_jump(loop_start).emit(sink, emit_info, state);
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::VecAluRRRImm5 {
                op,
                vd,
                vd_src,
                imm,
                vs2,
                ref mask,
                ..
            } => {
                debug_assert_eq!(vd.to_reg(), vd_src);
                sink.put4(encode_valu_rrr_imm(op, vd, imm, vs2, *mask));
            }
            &Inst::VecAluRRRR {
                op,
                vd,
                vd_src,
                vs1,
                vs2,
                ref mask,
                ..
            } => {
                debug_assert_eq!(vd.to_reg(), vd_src);
                sink.put4(encode_valu_rrrr(op, vd, vs2, vs1, *mask));
            }
            &Inst::VecAluRRR {
                op,
                vd,
                vs1,
                vs2,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu(op, vd, vs1, vs2, *mask));
            }
            &Inst::VecAluRRImm5 {
                op,
                vd,
                imm,
                vs2,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu_rr_imm(op, vd, imm, vs2, *mask));
            }
            &Inst::VecAluRR {
                op,
                vd,
                vs,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu_rr(op, vd, vs, *mask));
            }
            &Inst::VecAluRImm5 {
                op,
                vd,
                imm,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu_r_imm(op, vd, imm, *mask));
            }
            &Inst::VecSetState { rd, ref vstate } => {
                sink.put4(encode_vcfg_imm(
                    0x57,
                    rd.to_reg(),
                    vstate.avl.unwrap_static(),
                    &vstate.vtype,
                ));

                // Update the current vector emit state.
                state.vstate = EmitVState::Known(*vstate);
            }
            &Inst::VecLoad {
                eew,
                to,
                ref from,
                ref mask,
                flags,
                ..
            } => {
                // Vector loads don't support immediate offsets, so we need to
                // materialize the address in a register.
                let addr = match from {
                    VecAMode::UnitStride { base } => {
                        let base_reg = base.get_base_register();
                        let offset = base.get_offset_with_state(state);

                        // A Reg + 0 offset can be directly encoded.
                        if let (Some(base_reg), 0) = (base_reg, offset) {
                            base_reg
                        } else {
                            // Otherwise materialize the address into a register first.
                            let tmp = writable_spilltmp_reg();
                            Inst::LoadAddr {
                                rd: tmp,
                                mem: *base,
                            }
                            .emit(sink, emit_info, state);
                            tmp.to_reg()
                        }
                    }
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(trap_code);
                }

                sink.put4(encode_vmem_load(
                    0x07,
                    to.to_reg(),
                    eew,
                    addr,
                    from.lumop(),
                    *mask,
                    from.mop(),
                    from.nf(),
                ));
            }
            &Inst::VecStore {
                eew,
                ref to,
                from,
                ref mask,
                flags,
                ..
            } => {
                // Vector stores don't support immediate offsets, so we need to
                // materialize the address in a register.
                let addr = match to {
                    VecAMode::UnitStride { base } => {
                        let base_reg = base.get_base_register();
                        let offset = base.get_offset_with_state(state);

                        // A Reg + 0 offset can be directly encoded.
                        if let (Some(base_reg), 0) = (base_reg, offset) {
                            base_reg
                        } else {
                            // Otherwise materialize the address into a register first.
                            let tmp = writable_spilltmp_reg();
                            Inst::LoadAddr {
                                rd: tmp,
                                mem: *base,
                            }
                            .emit(sink, emit_info, state);
                            tmp.to_reg()
                        }
                    }
                };

                if let Some(trap_code) = flags.trap_code() {
                    // Register the offset at which the actual store instruction starts.
                    sink.add_trap(trap_code);
                }

                sink.put4(encode_vmem_store(
                    0x27,
                    from,
                    eew,
                    addr,
                    to.sumop(),
                    *mask,
                    to.mop(),
                    to.nf(),
                ));
            }
        };
    }
}

fn emit_return_call_common_sequence<T>(
    sink: &mut MachBuffer<Inst>,
    emit_info: &EmitInfo,
    state: &mut EmitState,
    info: &ReturnCallInfo<T>,
) {
    // The return call sequence can potentially emit a lot of instructions (up to 634 bytes!),
    // so let's emit an island here if we need one.
    //
    // It is difficult to calculate exactly how many instructions are going to be emitted, so
    // we calculate it by emitting the sequence into a disposable buffer, and then checking
    // how many bytes were actually emitted.
    let mut buffer = MachBuffer::new();
    let mut fake_emit_state = state.clone();

    return_call_emit_impl(&mut buffer, emit_info, &mut fake_emit_state, info);

    // Finalize the buffer and get the number of bytes emitted.
    let buffer = buffer.finish(&Default::default(), &mut Default::default());
    let length = buffer.data().len() as u32;

    // And now emit the island inline with this instruction.
    if sink.island_needed(length) {
        let jump_around_label = sink.get_label();
        Inst::gen_jump(jump_around_label).emit(sink, emit_info, state);
        sink.emit_island(length + 4, &mut state.ctrl_plane);
        sink.bind_label(jump_around_label, &mut state.ctrl_plane);
    }

    // Now that we're done, emit the *actual* return sequence.
    return_call_emit_impl(sink, emit_info, state, info);
}

/// This should not be called directly; instead prefer to call
/// [emit_return_call_common_sequence].
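/// That wrapper measures this sequence in a scratch buffer first and emits an
/// island if one is needed.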
fn return_call_emit_impl<T>(
    sink: &mut MachBuffer<Inst>,
    emit_info: &EmitInfo,
    state: &mut EmitState,
    info: &ReturnCallInfo<T>,
) {
    let sp_to_fp_offset = {
        let frame_layout = state.frame_layout();
        i64::from(
            frame_layout.clobber_size
                + frame_layout.fixed_frame_storage_size
                + frame_layout.outgoing_args_size,
        )
    };

    let mut clobber_offset = sp_to_fp_offset - 8;
    for reg in state.frame_layout().clobbered_callee_saves.clone() {
        let rreg = reg.to_reg();
        let ty = match rreg.class() {
            RegClass::Int => I64,
            RegClass::Float => F64,
            RegClass::Vector => unimplemented!("Vector Clobber Restores"),
        };

        Inst::gen_load(
            reg.map(Reg::from),
            AMode::SPOffset(clobber_offset),
            ty,
            MemFlags::trusted(),
        )
        .emit(sink, emit_info, state);

        clobber_offset -= 8;
    }

    // Restore the link register and frame pointer.
    let setup_area_size = i64::from(state.frame_layout().setup_area_size);
    if setup_area_size > 0 {
        Inst::gen_load(
            writable_link_reg(),
            AMode::SPOffset(sp_to_fp_offset + 8),
            I64,
            MemFlags::trusted(),
        )
        .emit(sink, emit_info, state);

        Inst::gen_load(
            writable_fp_reg(),
            AMode::SPOffset(sp_to_fp_offset),
            I64,
            MemFlags::trusted(),
        )
        .emit(sink, emit_info, state);
    }

    // If we over-allocated the incoming args area in the prologue, resize down to what the callee
    // is expecting.
    let incoming_args_diff =
        i64::from(state.frame_layout().tail_args_size - info.new_stack_arg_size);

    // Increment SP all at once.
    let sp_increment = sp_to_fp_offset + setup_area_size + incoming_args_diff;
    if sp_increment > 0 {
        for inst in Riscv64MachineDeps::gen_sp_reg_adjust(i32::try_from(sp_increment).unwrap()) {
            inst.emit(sink, emit_info, state);
        }
    }
}