;; x86-64 instruction selection and CLIF-to-MachInst lowering.

;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
(spec (lower arg)
      (provide (= result arg)))
(decl partial lower (Inst) InstOutput)

;; A variant of the main lowering constructor term, used for branches.
;; The only difference is that it gets an extra argument holding a vector
;; of branch targets to be used.
(decl partial lower_branch (Inst MachLabelSlice) Unit)

;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.
(rule (lower (has_type (fits_in_64 ty)
                       (iconst (u64_from_imm64 x))))
      (imm ty x))

;; `i128`
(rule 1 (lower (has_type $I128
                       (iconst (u64_from_imm64 x))))
      (value_regs (imm $I64 x)
                  (imm $I64 0)))

;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f16const (u16_from_ieee16 x)))
      (imm $F16 x))

;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f32const (u32_from_ieee32 x)))
      (imm $F32 x))

;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f64const (u64_from_ieee64 x)))
      (imm $F64 x))

;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f128const const))
      ;; TODO use Inst::gen_constant() instead.
      (x64_xmm_load_const $F128 (const_to_vconst const)))

(rule 1 (lower (f128const (u128_from_constant 0)))
      (xmm_zero $F128))

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Base case for 8 and 16-bit types
(rule -6 (lower (has_type (fits_in_16 ty)
                       (iadd x y)))
      (x64_add ty x y))

;; Base case for 32 and 64-bit types which might end up using the `lea`
;; instruction to fold multiple operations into one.
;;
;; Note that at this time this always generates a `lea` pseudo-instruction,
;; but the actual instruction emitted might be an `add` if it's equivalent.
;; For more details on this see the `emit.rs` logic to emit
;; `LoadEffectiveAddress`.
(rule iadd_base_case_32_or_64_lea -5 (lower (has_type (ty_32_or_64 ty) (iadd x y)))
      (x64_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset))))

;; Higher-priority cases than the previous two where a load can be sunk into
;; the add instruction itself. Note that both operands are tested for
;; sink-ability since addition is commutative
(rule -4 (lower (has_type (fits_in_64 ty)
                       (iadd x (sinkable_load y))))
      (x64_add ty x y))
(rule -3 (lower (has_type (fits_in_64 ty)
                       (iadd (sinkable_load x) y)))
      (x64_add ty y x))

;; SSE.

(rule (lower (has_type (multi_lane 8 16)
                       (iadd x y)))
      (x64_paddb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (iadd x y)))
      (x64_paddw x y))

(rule (lower (has_type (multi_lane 32 4)
                       (iadd x y)))
      (x64_paddd x y))

(rule (lower (has_type (multi_lane 64 2)
                       (iadd x y)))
      (x64_paddq x y))

;; `i128`
(rule 1 (lower (has_type $I128 (iadd x y)))
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs x)
            (y_regs ValueRegs y))
        (iadd128
          (value_regs_get_gpr x_regs 0)
          (value_regs_get_gpr x_regs 1)
          (value_regs_get_gpr y_regs 0)
          (value_regs_get_gpr y_regs 1))))
(rule 2 (lower (has_type $I128 (iadd x (iconcat y_lo y_hi))))
        (let ((x_regs ValueRegs x))
          (iadd128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1) y_lo y_hi)))
(rule 3 (lower (has_type $I128 (iadd x (uextend y @ (value_type $I64)))))
        (let ((x_regs ValueRegs x))
          (iadd128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1)
                   y (RegMemImm.Imm 0))))

;; Helper for lowering 128-bit addition with the 64-bit halves of the lhs/rhs
;; already split. The first two arguments are lo/hi for the lhs and the second
;; two are lo/hi for the rhs.
(decl iadd128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs)
(rule (iadd128 x_lo x_hi y_lo y_hi)
      (with_flags (x64_add_with_flags_paired $I64 x_lo y_lo)
                  (x64_adc_paired $I64 x_hi y_hi)))

;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl construct_overflow_op (CC ProducesFlags) InstOutput)
(rule (construct_overflow_op cc inst)
      (let ((results ValueRegs (with_flags inst
                                           (x64_setcc_paired cc))))
        (output_pair (value_regs_get results 0)
                     (value_regs_get results 1))))

(decl construct_overflow_op_alu (Type CC AluRmiROpcode Gpr GprMemImm) InstOutput)
(rule (construct_overflow_op_alu ty cc alu_op src1 src2)
      (construct_overflow_op cc (x64_alurmi_with_flags_paired alu_op ty src1 src2)))

;; This essentially creates
;; alu_<op1> x_lo, y_lo
;; alu_<op2> x_hi, y_hi
;; set<cc> r8
(decl construct_overflow_op_alu_128 (CC AluRmiROpcode AluRmiROpcode Value Value) InstOutput)
(rule (construct_overflow_op_alu_128 cc op1 op2 x y)
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs x)
            (x_lo Gpr (value_regs_get_gpr x_regs 0))
            (x_hi Gpr (value_regs_get_gpr x_regs 1)))
        ;; Get the high/low registers for `y`.
        (let ((y_regs ValueRegs y)
              (y_lo Gpr (value_regs_get_gpr y_regs 0))
              (y_hi Gpr (value_regs_get_gpr y_regs 1)))
          (let    ((lo_inst ProducesFlags (x64_alurmi_with_flags_paired op1 $I64 x_lo y_lo))
                   (hi_inst ConsumesAndProducesFlags (x64_alurmi_with_flags_chained op2 $I64 x_hi y_hi))
                   (of_inst ConsumesFlags (x64_setcc_paired cc))

                   (result MultiReg (with_flags_chained lo_inst hi_inst of_inst)))
                  (multi_reg_to_pair_and_single result)))))

;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (uadd_overflow x y @ (value_type (fits_in_64 ty))))
      (construct_overflow_op_alu ty (CC.B) (AluRmiROpcode.Add) x y))

;; i128 gets lowered into adc and add
(rule 0 (lower (uadd_overflow x y @ (value_type $I128)))
        (construct_overflow_op_alu_128 (CC.B) (AluRmiROpcode.Add) (AluRmiROpcode.Adc) x y))

;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (sadd_overflow x y @ (value_type (fits_in_64 ty))))
      (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Add) x y))

(rule 0 (lower (sadd_overflow x y @ (value_type $I128)))
        (construct_overflow_op_alu_128 (CC.O) (AluRmiROpcode.Add) (AluRmiROpcode.Adc) x y))

;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (usub_overflow x y @ (value_type (fits_in_64 ty))))
      (construct_overflow_op_alu ty (CC.B) (AluRmiROpcode.Sub) x y))

(rule 0 (lower (usub_overflow x y @ (value_type $I128)))
        (construct_overflow_op_alu_128 (CC.B) (AluRmiROpcode.Sub) (AluRmiROpcode.Sbb) x y))

;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (ssub_overflow x y @ (value_type (fits_in_64 ty))))
      (construct_overflow_op_alu ty (CC.O) (AluRmiROpcode.Sub) x y))

(rule 0 (lower (ssub_overflow x y @ (value_type $I128)))
        (construct_overflow_op_alu_128 (CC.O) (AluRmiROpcode.Sub) (AluRmiROpcode.Sbb) x y))

;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower (umul_overflow x y @ (value_type $I8)))
      (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired false x y)))

(rule 3 (lower (umul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty))))
      (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty false x y)))

;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower (smul_overflow x y @ (value_type $I8)))
      (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired true x y)))

(rule 3 (lower (smul_overflow x y @ (value_type (ty_int_ref_16_to_64 ty))))
      (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty true x y)))

;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16)
                       (sadd_sat x y)))
      (x64_paddsb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (sadd_sat x y)))
      (x64_paddsw x y))

;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16)
                       (uadd_sat x y)))
      (x64_paddusb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (uadd_sat x y)))
      (x64_paddusw x y))

;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Sub two registers.
(rule -3 (lower (has_type (fits_in_64 ty)
                       (isub x y)))
      (x64_sub ty x y))

;; SSE.

(rule (lower (has_type (multi_lane 8 16)
                       (isub x y)))
      (x64_psubb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (isub x y)))
      (x64_psubw x y))

(rule (lower (has_type (multi_lane 32 4)
                       (isub x y)))
      (x64_psubd x y))

(rule (lower (has_type (multi_lane 64 2)
                       (isub x y)))
      (x64_psubq x y))

;; `i128`
(rule 1 (lower (has_type $I128 (isub x y)))
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs x)
            (y_regs ValueRegs y))
        (isub128
          (value_regs_get_gpr x_regs 0)
          (value_regs_get_gpr x_regs 1)
          (value_regs_get_gpr y_regs 0)
          (value_regs_get_gpr y_regs 1))))
(rule 2 (lower (has_type $I128 (isub x (iconcat y_lo y_hi))))
        (let ((x_regs ValueRegs x))
          (isub128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1) y_lo y_hi)))
(rule 3 (lower (has_type $I128 (isub x (uextend y @ (value_type $I64)))))
        (let ((x_regs ValueRegs x))
          (isub128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1)
                   y (RegMemImm.Imm 0))))

;; Helper for lowering 128-bit subtraction with the 64-bit halves of the lhs/rhs
;; already split. The first two arguments are lo/hi for the lhs and the second
;; two are lo/hi for the rhs.
(decl isub128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs)
(rule (isub128 x_lo x_hi y_lo y_hi)
      (with_flags (x64_sub_with_flags_paired $I64 x_lo y_lo)
                  (x64_sbb_paired $I64 x_hi y_hi)))

;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16)
                       (ssub_sat x y)))
      (x64_psubsb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (ssub_sat x y)))
      (x64_psubsw x y))

;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16)
                       (usub_sat x y)))
      (x64_psubusb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (usub_sat x y)))
      (x64_psubusw x y))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `{i,b}64` and smaller.

;; And two registers.
(rule 0 (lower (has_type ty (band x y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_and ty x y))

;; The above case automatically handles when the rhs is an immediate or a
;; sinkable load, but additionally handle the lhs here.

(rule 1 (lower (has_type ty (band (sinkable_load x) y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_and ty y x))

(rule 2 (lower (has_type ty (band (simm32_from_value x) y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_and ty y x))

;; f32 and f64

(rule 5 (lower (has_type (ty_scalar_float ty) (band x y)))
      (sse_and ty x y))

;; SSE.

(decl sse_and (Type Xmm XmmMem) Xmm)
(rule (sse_and $F32X4 x y) (x64_andps x y))
(rule (sse_and $F64X2 x y) (x64_andpd x y))
(rule (sse_and $F32 x y) (x64_andps x y))
(rule (sse_and $F64 x y) (x64_andpd x y))
(rule -1 (sse_and (multi_lane _bits _lanes) x y) (x64_pand x y))

(rule 6 (lower (has_type ty @ (multi_lane _bits _lanes)
                       (band x y)))
      (sse_and ty x y))

;; `i128`.

(decl and_i128 (ValueRegs ValueRegs) ValueRegs)
(rule (and_i128 x y)
      (let ((x_regs ValueRegs x)
            (x_lo Gpr (value_regs_get_gpr x_regs 0))
            (x_hi Gpr (value_regs_get_gpr x_regs 1))
            (y_regs ValueRegs y)
            (y_lo Gpr (value_regs_get_gpr y_regs 0))
            (y_hi Gpr (value_regs_get_gpr y_regs 1)))
        (value_gprs (x64_and $I64 x_lo y_lo)
                    (x64_and $I64 x_hi y_hi))))

(rule 7 (lower (has_type $I128 (band x y)))
      (and_i128 x y))

;; Specialized lowerings for `(band x (bnot y))` which is additionally produced
;; by Cranelift's `band_not` instruction that is legalized into the simpler
;; forms early on.

(decl sse_and_not (Type Xmm XmmMem) Xmm)
(rule (sse_and_not $F32X4 x y) (x64_andnps x y))
(rule (sse_and_not $F64X2 x y) (x64_andnpd x y))
(rule -1 (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y))

;; Note the flipping of operands below as we're match
;;
;;   (band x (bnot y))
;;
;; while x86 does
;;
;;   pandn(x, y) = and(not(x), y)
(rule 8 (lower (has_type ty @ (multi_lane _bits _lane) (band x (bnot y))))
      (sse_and_not ty y x))
(rule 9 (lower (has_type ty @ (multi_lane _bits _lane) (band (bnot y) x)))
      (sse_and_not ty y x))

(rule 10 (lower (has_type ty (band x (bnot y))))
      (if (ty_int_ref_scalar_64 ty))
      (if-let true (use_bmi1))
      ;; the first argument is the one that gets inverted with andn
      (x64_andn ty y x))
(rule 11 (lower (has_type ty (band (bnot y) x)))
      (if (ty_int_ref_scalar_64 ty))
      (if-let true (use_bmi1))
      (x64_andn ty y x))

;; Specialization of `blsr` for BMI1

(decl pure partial val_minus_one (Value) Value)
(rule 0 (val_minus_one (isub x (u64_from_iconst 1))) x)
(rule 0 (val_minus_one (iadd x (i64_from_iconst -1))) x)
(rule 1 (val_minus_one (iadd (i64_from_iconst -1) x)) x)

(rule 12 (lower (has_type (ty_32_or_64 ty) (band x y)))
         (if-let true (use_bmi1))
         (if-let x (val_minus_one y))
         (x64_blsr ty x))
(rule 13 (lower (has_type (ty_32_or_64 ty) (band y x)))
         (if-let true (use_bmi1))
         (if-let x (val_minus_one y))
         (x64_blsr ty x))

;; Specialization of `blsi` for BMI1

(rule 14 (lower (has_type (ty_32_or_64 ty) (band (ineg x) x)))
         (if-let true (use_bmi1))
         (x64_blsi ty x))
(rule 15 (lower (has_type (ty_32_or_64 ty) (band x (ineg x))))
         (if-let true (use_bmi1))
         (x64_blsi ty x))

;; Specialization of `bzhi` for BMI2
;;
;; The `bzhi` instruction clears all bits indexed by the second operand of the
;; first operand. This is pattern-matched here with a `band` against a mask
;; which is generated to be N bits large. Note that if the index is larger than
;; the bit-width of the type then `bzhi` doesn't have the same semantics as
;; `ishl`, so an `and` instruction is required to mask the index to match the
;; semantics of Cranelift's `ishl`.

(rule 16 (lower (has_type (ty_32_or_64 ty) (band x y)))
         (if-let true (use_bmi2))
         (if-let (ishl (u64_from_iconst 1) index) (val_minus_one y))
         (x64_bzhi ty x (x64_and ty index (RegMemImm.Imm (u32_sub (ty_bits ty) 1)))))

;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `{i,b}64` and smaller.

;; Or two registers.
(rule 0 (lower (has_type ty (bor x y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_or ty x y))

;; Handle immediates/sinkable loads on the lhs in addition to the automatic
;; handling of the rhs above

(rule 1 (lower (has_type ty (bor (sinkable_load x) y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_or ty y x))

(rule 2 (lower (has_type ty (bor (simm32_from_value x) y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_or ty y x))

;; f32 and f64

(rule 5 (lower (has_type (ty_scalar_float ty) (bor x y)))
      (sse_or ty x y))

;; SSE.

(decl sse_or (Type Xmm XmmMem) Xmm)
(rule (sse_or $F32X4 x y) (x64_orps x y))
(rule (sse_or $F64X2 x y) (x64_orpd x y))
(rule (sse_or $F32 x y) (x64_orps x y))
(rule (sse_or $F64 x y) (x64_orpd x y))
(rule -1 (sse_or (multi_lane _bits _lanes) x y) (x64_por x y))

(rule 6 (lower (has_type ty @ (multi_lane _bits _lanes)
                       (bor x y)))
      (sse_or ty x y))

;; `{i,b}128`.

(decl or_i128 (ValueRegs ValueRegs) ValueRegs)
(rule (or_i128 x y)
      (let ((x_lo Gpr (value_regs_get_gpr x 0))
            (x_hi Gpr (value_regs_get_gpr x 1))
            (y_lo Gpr (value_regs_get_gpr y 0))
            (y_hi Gpr (value_regs_get_gpr y 1)))
        (value_gprs (x64_or $I64 x_lo y_lo)
                    (x64_or $I64 x_hi y_hi))))

(rule 7 (lower (has_type $I128 (bor x y)))
      (or_i128 x y))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `{i,b}64` and smaller.

;; Xor two registers.
(rule 0 (lower (has_type ty (bxor x y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_xor ty x y))

;; Handle xor with lhs immediates/sinkable loads in addition to the automatic
;; handling of the rhs above.

(rule 1 (lower (has_type ty (bxor (sinkable_load x) y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_xor ty y x))

(rule 4 (lower (has_type ty (bxor (simm32_from_value x) y)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_xor ty y x))

;; f32 and f64

(rule 5 (lower (has_type (ty_scalar_float ty) (bxor x y)))
      (x64_xor_vector ty x y))

;; SSE.

(rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y)))
      (x64_xor_vector ty x y))

;; `{i,b}128`.

(rule 7 (lower (has_type $I128 (bxor x y)))
      (let ((x_regs ValueRegs x)
            (x_lo Gpr (value_regs_get_gpr x_regs 0))
            (x_hi Gpr (value_regs_get_gpr x_regs 1))
            (y_regs ValueRegs y)
            (y_lo Gpr (value_regs_get_gpr y_regs 0))
            (y_hi Gpr (value_regs_get_gpr y_regs 1)))
        (value_gprs (x64_xor $I64 x_lo y_lo)
                    (x64_xor $I64 x_hi y_hi))))

;; Specialization of `blsmsk` for BMI1

(rule 8 (lower (has_type (ty_32_or_64 ty) (bxor x y)))
        (if-let true (use_bmi1))
        (if-let x (val_minus_one y))
        (x64_blsmsk ty x))
(rule 9 (lower (has_type (ty_32_or_64 ty) (bxor y x)))
        (if-let true (use_bmi1))
        (if-let x (val_minus_one y))
        (x64_blsmsk ty x))

;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

(rule -1 (lower (has_type (fits_in_64 ty) (ishl src amt)))
      (x64_shl ty src (put_masked_in_imm8_gpr amt ty)))

;; `i128`.

(decl shl_i128 (ValueRegs Gpr) ValueRegs)
(rule (shl_i128 src amt)
      ;; Unpack the registers that make up the 128-bit value being shifted.
      (let ((src_lo Gpr (value_regs_get_gpr src 0))
            (src_hi Gpr (value_regs_get_gpr src 1))
            ;; Do two 64-bit shifts.
            (lo_shifted Gpr (x64_shl $I64 src_lo amt))
            (hi_shifted Gpr (x64_shl $I64 src_hi amt))
            ;; `src_lo >> (64 - amt)` are the bits to carry over from the lo
            ;; into the hi.
            (carry Gpr (x64_shr $I64
                            src_lo
                            (x64_sub $I64
                                 (imm $I64 64)
                                 amt)))
            (zero Gpr (imm $I64 0))
            ;; Nullify the carry if we are shifting in by a multiple of 128.
            (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64)
                                              amt
                                              (RegMemImm.Imm 127))
                                        (cmove $I64
                                               (CC.Z)
                                               zero
                                               carry)))
            ;; Add the carry into the high half.
            (hi_shifted_ Gpr (x64_or $I64 carry_ hi_shifted)))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the low bits are zero and the high bits are our
        ;; low bits.
        (with_flags (x64_test (OperandSize.Size64) amt (RegMemImm.Imm 64))
                    (consumes_flags_concat
                     (cmove $I64 (CC.Z) lo_shifted zero)
                     (cmove $I64 (CC.Z) hi_shifted_ lo_shifted)))))

(rule (lower (has_type $I128 (ishl src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
      ;; amount to the value's bit width.
      (let ((amt_ Gpr (lo_gpr amt)))
        (shl_i128 src amt_)))

;; SSE.

;; Since the x86 instruction set does not have any 8x16 shift instructions (even
;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
;; instructions. The basic idea, whether the amount to shift by is an immediate
;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
(rule (lower (has_type ty @ $I8X16 (ishl src amt)))
      (let (
            ;; Mask the amount to ensure wrapping behaviour
            (masked_amt RegMemImm (mask_xmm_shift ty amt))
            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
            ;; correct for half of the lanes; the others must be fixed up with
            ;; the mask below.
            (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
            (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
            (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
        (sse_and $I8X16 unmasked (RegMem.Reg mask))))

;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ishl_i8x16_mask (RegMemImm) SyntheticAmode)

;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
(rule (ishl_i8x16_mask (RegMemImm.Imm amt))
      (ishl_i8x16_mask_for_const amt))

;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We use `lea` to find the
;; base address of the mask table and then complex addressing to offset to the
;; right mask: `base_address + amt << 4`
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
(rule (ishl_i8x16_mask (RegMemImm.Reg amt))
      (let ((mask_table SyntheticAmode (ishl_i8x16_mask_table))
            (base_mask_addr Gpr (x64_lea $I64 mask_table))
            (mask_offset Gpr (x64_shl $I64 amt
                                  (imm8_to_imm8_gpr 4))))
        (Amode.ImmRegRegShift 0
                              base_mask_addr
                              mask_offset
                              0
                              (mem_flags_trusted))))

(rule (ishl_i8x16_mask (RegMemImm.Mem amt))
      (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.

(rule (lower (has_type ty @ $I16X8 (ishl src amt)))
      (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

(rule (lower (has_type ty @ $I32X4 (ishl src amt)))
      (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

(rule (lower (has_type ty @ $I64X2 (ishl src amt)))
      (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

(rule -1 (lower (has_type (fits_in_64 ty) (ushr src amt)))
      (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Zero))))
        (x64_shr ty src_ (put_masked_in_imm8_gpr amt ty))))

;; `i128`.

(decl shr_i128 (ValueRegs Gpr) ValueRegs)
(rule (shr_i128 src amt)
      ;; Unpack the lo/hi halves of `src`.
      (let ((src_lo Gpr (value_regs_get_gpr src 0))
            (src_hi Gpr (value_regs_get_gpr src 1))
            ;; Do a shift on each half.
            (lo_shifted Gpr (x64_shr $I64 src_lo amt))
            (hi_shifted Gpr (x64_shr $I64 src_hi amt))
            ;; `src_hi << (64 - amt)` are the bits to carry over from the hi
            ;; into the lo.
            (carry Gpr (x64_shl $I64
                            src_hi
                            (x64_sub $I64
                                 (imm $I64 64)
                                 amt)))
            ;; Share the zero value to reduce register pressure
            (zero Gpr (imm $I64 0))

            ;; Nullify the carry if we are shifting by a multiple of 128.
            (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) amt (RegMemImm.Imm 127))
                                        (cmove $I64 (CC.Z) zero carry)))
            ;; Add the carry bits into the lo.
            (lo_shifted_ Gpr (x64_or $I64 carry_ lo_shifted)))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the hi bits are zero and the lo bits are what
        ;; would otherwise be our hi bits.
        (with_flags (x64_test (OperandSize.Size64) amt (RegMemImm.Imm 64))
                    (consumes_flags_concat
                     (cmove $I64 (CC.Z) lo_shifted_ hi_shifted)
                     (cmove $I64 (CC.Z) hi_shifted zero)))))

(rule (lower (has_type $I128 (ushr src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
      ;; amount to the value's bit width.
      (let ((amt_ Gpr (lo_gpr amt)))
        (shr_i128 src amt_)))

;; SSE.

;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
;; with 8x16 `ishl`.
(rule (lower (has_type ty @ $I8X16 (ushr src amt)))
      (let (
            ;; Mask the amount to ensure wrapping behaviour
            (masked_amt RegMemImm (mask_xmm_shift ty amt))
            ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
            ;; correct for half of the lanes; the others must be fixed up with
            ;; the mask below.
            (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
        (sse_and $I8X16
                 unmasked
                 (ushr_i8x16_mask masked_amt))))

;; Get the address of the mask to use when fixing up the lanes that weren't
;; correctly generated by the 16x8 shift.
(decl ushr_i8x16_mask (RegMemImm) SyntheticAmode)

;; When the shift amount is known, we can statically (i.e. at compile time)
;; determine the mask to use and only emit that.
(decl ushr_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const)
(rule (ushr_i8x16_mask (RegMemImm.Imm amt))
      (ushr_i8x16_mask_for_const amt))

;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run
;; time) find the correct mask offset in the table. We use `lea` to find the
;; base address of the mask table and then complex addressing to offset to the
;; right mask: `base_address + amt << 4`
(decl ushr_i8x16_mask_table () SyntheticAmode)
(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
(rule (ushr_i8x16_mask (RegMemImm.Reg amt))
      (let ((mask_table SyntheticAmode (ushr_i8x16_mask_table))
            (base_mask_addr Gpr (x64_lea $I64 mask_table))
            (mask_offset Gpr (x64_shl $I64
                                  amt
                                  (imm8_to_imm8_gpr 4))))
        (Amode.ImmRegRegShift 0
                              base_mask_addr
                              mask_offset
                              0
                              (mem_flags_trusted))))

(rule (ushr_i8x16_mask (RegMemImm.Mem amt))
      (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))

;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.

(rule (lower (has_type ty @ $I16X8 (ushr src amt)))
      (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

(rule (lower (has_type ty @ $I32X4 (ushr src amt)))
      (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

(rule (lower (has_type ty @ $I64X2 (ushr src amt)))
      (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

(decl mask_xmm_shift (Type Value) RegMemImm)
(rule (mask_xmm_shift ty amt)
      (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
(rule 1 (mask_xmm_shift ty (iconst n))
      (RegMemImm.Imm (shift_amount_masked ty n)))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

(rule -1 (lower (has_type (fits_in_64 ty) (sshr src amt)))
      (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Sign))))
        (x64_sar ty src_ (put_masked_in_imm8_gpr amt ty))))

;; `i128`.

(decl sar_i128 (ValueRegs Gpr) ValueRegs)
(rule (sar_i128 src amt)
      ;; Unpack the low/high halves of `src`.
      (let ((src_lo Gpr (value_regs_get_gpr src 0))
            (src_hi Gpr (value_regs_get_gpr src 1))
            ;; Do a shift of each half. NB: the low half uses an unsigned shift
            ;; because its MSB is not a sign bit.
            (lo_shifted Gpr (x64_shr $I64 src_lo amt))
            (hi_shifted Gpr (x64_sar $I64 src_hi amt))
            ;; `src_hi << (64 - amt)` are the bits to carry over from the low
            ;; half to the high half.
            (carry Gpr (x64_shl $I64
                            src_hi
                            (x64_sub $I64
                                 (imm $I64 64)
                                 amt)))
            ;; Nullify the carry if we are shifting by a multiple of 128.
            (carry_ Gpr (with_flags_reg (x64_test (OperandSize.Size64) amt (RegMemImm.Imm 127))
                                        (cmove $I64 (CC.Z) (imm $I64 0) carry)))
            ;; Add the carry into the low half.
            (lo_shifted_ Gpr (x64_or $I64 lo_shifted carry_))
            ;; Get all sign bits.
            (sign_bits Gpr (x64_sar $I64 src_hi (imm8_to_imm8_gpr 63))))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the hi bits are all sign bits and the lo bits are
        ;; what would otherwise be our hi bits.
        (with_flags (x64_test (OperandSize.Size64) amt (RegMemImm.Imm 64))
                    (consumes_flags_concat
                     (cmove $I64 (CC.Z) lo_shifted_ hi_shifted)
                     (cmove $I64 (CC.Z) hi_shifted sign_bits)))))

(rule (lower (has_type $I128 (sshr src amt)))
      ;; NB: Only the low bits of `amt` matter since we logically mask the shift
      ;; amount to the value's bit width.
      (let ((amt_ Gpr (lo_gpr amt)))
        (sar_i128 src amt_)))

;; SSE.

;; Since the x86 instruction set does not have an 8x16 shift instruction and the
;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
;; preserve the sign), we use a different approach here: separate the low and
;; high lanes, shift them separately, and merge them into the final result.
;;
;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
;; s15]:
;;
;;   lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
;;   shifted_lo.i16x8 = shift each lane of `low`
;;   hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
;;   shifted_hi.i16x8 = shift each lane of `high`
;;   result = [s0'', s1'', ..., s15'']
(rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
      (let ((src_ Xmm (put_in_xmm src))
            ;; Mask the amount to ensure wrapping behaviour
            (masked_amt RegMemImm (mask_xmm_shift ty amt))
            ;; In order for `packsswb` later to only use the high byte of each
            ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
            ;; fill in the upper bits appropriately.
            (lo Xmm (x64_punpcklbw src_ src_))
            (hi Xmm (x64_punpckhbw src_ src_))
            (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt))
            (shifted_lo Xmm (x64_psraw lo amt_))
            (shifted_hi Xmm (x64_psraw hi amt_)))
        (x64_packsswb shifted_lo shifted_hi)))

(decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm)
(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i))
      (xmm_mem_imm_new (RegMemImm.Imm (u32_add i 8))))
(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r))
      (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty
                                          r
                                          (RegMemImm.Imm 8)))))
(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m))
      (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty
                                          (imm ty 8)
                                          rmi))))

;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
;; that if the shift amount is in a register, it is in an XMM register.

(rule (lower (has_type ty @ $I16X8 (sshr src amt)))
      (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

(rule (lower (has_type ty @ $I32X4 (sshr src amt)))
      (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt))))

;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
;; feature sets. To remedy this, a small dance is done with an unsigned right
;; shift plus some extra ops.
(rule 3 (lower (has_type ty @ $I64X2 (sshr src (iconst n))))
        (if-let true (use_avx512vl))
        (if-let true (use_avx512f))
        (x64_vpsraq_imm src (shift_amount_masked ty n)))

(rule 2 (lower (has_type ty @ $I64X2 (sshr src amt)))
        (if-let true (use_avx512vl))
        (if-let true (use_avx512f))
        (let ((masked Gpr (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
          (x64_vpsraq src (x64_movd_to_xmm masked))))

(rule 1 (lower (has_type $I64X2 (sshr src (iconst (u64_from_imm64 (u64_as_u32 amt))))))
        (lower_i64x2_sshr_imm src (u32_and amt 63)))

(rule (lower (has_type $I64X2 (sshr src amt)))
      (lower_i64x2_sshr_gpr src (x64_and $I64 amt (RegMemImm.Imm 63))))

(decl lower_i64x2_sshr_imm (Xmm u32) Xmm)

;; If the shift amount is less than 32 then do an sshr with 32-bit lanes to
;; produce the upper halves of each result, followed by a ushr of 64-bit lanes
;; to produce the lower halves of each result. Interleave results at the end.
(rule 2 (lower_i64x2_sshr_imm vec imm)
        (if-let true (u64_lt imm 32))
        (let (
            (high32 Xmm (x64_psrad vec (xmi_imm imm)))
            (high32 Xmm (x64_pshufd high32 0b11_10_11_01))
            (low32  Xmm (x64_psrlq vec (xmi_imm imm)))
            (low32  Xmm (x64_pshufd low32 0b11_10_10_00))
          )
          (x64_punpckldq low32 high32)))

;; If the shift amount is 32 then the `psrlq` from the above rule can be avoided
(rule 1 (lower_i64x2_sshr_imm vec 32)
        (let (
            (low32  Xmm (x64_pshufd vec 0b11_10_11_01))
            (high32 Xmm (x64_psrad vec (xmi_imm 31)))
            (high32 Xmm (x64_pshufd high32 0b11_10_11_01))
          )
          (x64_punpckldq low32 high32)))

;; Shifts >= 32 use one `psrad` to generate the upper bits and second `psrad` to
;; generate the lower bits. Everything is then woven back together with
;; shuffles.
(rule (lower_i64x2_sshr_imm vec imm)
      (if-let true (u64_lt 32 imm))
      (let (
          (high32 Xmm (x64_psrad vec (xmi_imm 31)))
          (high32 Xmm (x64_pshufd high32 0b11_10_11_01))
          (low32  Xmm (x64_psrad vec (xmi_imm (u32_sub imm 32))))
          (low32  Xmm (x64_pshufd low32 0b11_10_11_01))
        )
        (x64_punpckldq low32 high32)))

;; A variable shift amount is slightly more complicated than the immediate
;; shift amounts from above. The `Gpr` argument is guaranteed to be <= 63 by
;; earlier masking. A `ushr` operation is used with some xor/sub math to
;; generate the sign bits.
(decl lower_i64x2_sshr_gpr (Xmm Gpr) Xmm)
(rule (lower_i64x2_sshr_gpr vec val)
      (let (
          (val                Xmm (x64_movq_to_xmm val))
          (mask               Xmm (flip_high_bit_mask $I64X2))
          (sign_bit_loc       Xmm (x64_psrlq mask val))
          (ushr               Xmm (x64_psrlq vec val))
          (ushr_sign_bit_flip Xmm (x64_pxor sign_bit_loc ushr))
        )
        (x64_psubq ushr_sign_bit_flip sign_bit_loc)))

;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: we can rely on x86's rotate-amount masking since
;;  we operate on the whole register. For const's we mask the constant.

(rule -1 (lower (has_type (fits_in_64 ty) (rotl src amt)))
        (x64_rotl ty src (put_masked_in_imm8_gpr amt ty)))


;; `i128`.

(rule (lower (has_type $I128 (rotl src amt)))
      (let ((src_ ValueRegs src)
            ;; NB: Only the low bits of `amt` matter since we logically mask the
            ;; rotation amount to the value's bit width.
            (amt_ Gpr (lo_gpr amt)))
        (or_i128 (shl_i128 src_ amt_)
                 (shr_i128 src_ (x64_sub $I64
                                     (imm $I64 128)
                                     amt_)))))

;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: we can rely on x86's rotate-amount masking since
;;  we operate on the whole register. For const's we mask the constant.

(rule -1 (lower (has_type (fits_in_64 ty) (rotr src amt)))
        (x64_rotr ty src (put_masked_in_imm8_gpr amt ty)))


;; `i128`.

(rule (lower (has_type $I128 (rotr src amt)))
      (let ((src_ ValueRegs src)
            ;; NB: Only the low bits of `amt` matter since we logically mask the
            ;; rotation amount to the value's bit width.
            (amt_ Gpr (lo_gpr amt)))
        (or_i128 (shr_i128 src_ amt_)
                 (shl_i128 src_ (x64_sub $I64
                                     (imm $I64 128)
                                     amt_)))))

;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

(rule -1 (lower (has_type (fits_in_64 ty) (ineg x)))
      (x64_neg ty x))

(rule -2 (lower (has_type $I128 (ineg x)))
      ;; Get the high/low registers for `x`.
      (let ((regs ValueRegs x)
            (lo Gpr (value_regs_get_gpr regs 0))
            (hi Gpr (value_regs_get_gpr regs 1)))
        ;; Do a neg followed by an sub-with-borrow.
        (with_flags (x64_neg_paired $I64 lo)
                    (x64_sbb_paired $I64 (imm $I64 0) hi))))

;; SSE.

(rule (lower (has_type $I8X16 (ineg x)))
      (x64_psubb (imm $I8X16 0) x))

(rule (lower (has_type $I16X8 (ineg x)))
      (x64_psubw (imm $I16X8 0) x))

(rule (lower (has_type $I32X4 (ineg x)))
      (x64_psubd (imm $I32X4 0) x))

(rule (lower (has_type $I64X2 (ineg x)))
      (x64_psubq (imm $I64X2 0) x))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16)
                       (avg_round x y)))
      (x64_pavgb x y))

(rule (lower (has_type (multi_lane 16 8)
                       (avg_round x y)))
      (x64_pavgw x y))

;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; 8-bit base case, needs a special instruction encoding and additionally
;; move sinkable loads to the right.
(rule -8 (lower (has_type $I8 (imul x y))) (x64_mul8 false x y))
(rule -7 (lower (has_type $I8 (imul (sinkable_load x) y))) (x64_mul8 false y x))

;; 16-to-64-bit base cases, same as above by moving sinkable loads to the right.
(rule -6 (lower (has_type (ty_int_ref_16_to_64 ty) (imul x y)))
         (x64_imul ty x y))
(rule -5 (lower (has_type (ty_int_ref_16_to_64 ty) (imul (sinkable_load x) y)))
         (x64_imul ty y x))

;; lift out constants to use 3-operand form
(rule -4 (lower (has_type (ty_int_ref_16_to_64 ty) (imul x (i32_from_iconst y))))
         (x64_imul_imm ty x y))
(rule -3 (lower (has_type (ty_int_ref_16_to_64 ty) (imul (i32_from_iconst x) y)))
         (x64_imul_imm ty y x))

;; Special case widening multiplication from 8-to-16-bits with a single
;; instruction since the 8-bit-multiply places both the high and low halves in
;; the same register
(rule -2 (lower (has_type $I16 (imul (sextend x) (sextend y))))
  (x64_mul8 true x y))
(rule -2 (lower (has_type $I16 (imul (uextend x) (uextend y))))
  (x64_mul8 false x y))

;; `i128`.

(rule 2 (lower (has_type $I128 (imul x y)))
      (let ((x_regs ValueRegs x)
            (y_regs ValueRegs y))
        (imul128
          (value_regs_get_gpr x_regs 0)
          (value_regs_get_gpr x_regs 1)
          (value_regs_get_gpr y_regs 0)
          (value_regs_get_gpr y_regs 1))))

(rule 4 (lower (has_type $I128 (imul (iconcat x_lo x_hi) (iconcat y_lo y_hi))))
        (imul128 x_lo x_hi y_lo y_hi))

;; Helper for lowering 128-bit multiplication with the 64-bit halves of the
;; lhs/rhs already split. The first two arguments are lo/hi for the lhs and the
;; second two are lo/hi for the rhs.
;;
;; mul:
;;   dst_lo = lhs_lo * rhs_lo
;;   dst_hi = umulhi(lhs_lo, rhs_lo) +
;;            lhs_lo * rhs_hi +
;;            lhs_hi * rhs_lo
;;
;; so we emit:
;;   lo_hi = mul x_lo, y_hi
;;   hi_lo = mul x_hi, y_lo
;;   hilo_hilo = add lo_hi, hi_lo
;;   dst_lo:hi_lolo = mulhi_u x_lo, y_lo
;;   dst_hi = add hilo_hilo, hi_lolo
;;   return (dst_lo, dst_hi)
(decl imul128 (Gpr Gpr GprMem GprMem) ValueRegs)
(rule (imul128 x_lo x_hi y_lo y_hi)
      ;; Put `x` into registers and unpack its hi/lo halves.
      (let (
            ;; lo_hi = mul x_lo, y_hi
            (lo_hi Gpr (x64_imul $I64 x_lo y_hi))
            ;; hi_lo = mul x_hi, y_lo
            (hi_lo Gpr (x64_imul $I64 x_hi y_lo))
            ;; hilo_hilo = add lo_hi, hi_lo
            (hilo_hilo Gpr (x64_add $I64 lo_hi hi_lo))
            ;; dst_lo:hi_lolo = x64_mul x_lo, y_lo
            (mul_regs ValueRegs (x64_mul $I64 false x_lo y_lo))
            (dst_lo Gpr (value_regs_get_gpr mul_regs 0))
            (hi_lolo Gpr (value_regs_get_gpr mul_regs 1))
            ;; dst_hi = add hilo_hilo, hi_lolo
            (dst_hi Gpr (x64_add $I64 hilo_hilo hi_lolo)))
        (value_gprs dst_lo dst_hi)))

;; The `mul` and `imul` instructions on x64 are defined as taking 64-bit
;; operands and producing a 128-bit result, which exactly matches the semantics
;; of widening 64-bit inputs to 128-bit and then multiplying them. That means
;; that these cases can get some some simpler codegen.
(rule 5 (lower (has_type $I128 (imul (uextend x @ (value_type $I64))
                                     (uextend y @ (value_type $I64)))))
        (x64_mul $I64 false x y))
(rule 5 (lower (has_type $I128 (imul (sextend x @ (value_type $I64))
                                     (sextend y @ (value_type $I64)))))
        (x64_mul $I64 true x y))

;; SSE.

;; (No i8x16 multiply.)

(rule (lower (has_type (multi_lane 16 8) (imul x y)))
      (x64_pmullw x y))

(rule (lower (has_type (multi_lane 32 4) (imul x y)))
      (if-let true (use_sse41))
      (x64_pmulld x y))

;; Without `pmulld` the `pmuludq` instruction is used instead which performs
;; 32-bit multiplication storing the 64-bit result. The 64-bit result is
;; truncated to 32-bits and everything else is woven into place.
(rule -1 (lower (has_type (multi_lane 32 4) (imul x y)))
         (let (
            (x Xmm x)
            (y Xmm y)
            (x_hi Xmm (x64_pshufd x 0b00_11_00_01))
            (y_hi Xmm (x64_pshufd y 0b00_11_00_01))
            (mul_lo Xmm (x64_pshufd (x64_pmuludq x y)       0b00_00_10_00))
            (mul_hi Xmm (x64_pshufd (x64_pmuludq x_hi y_hi) 0b00_00_10_00))
          )
          (x64_punpckldq mul_lo mul_hi)))

;; With AVX-512 we can implement `i64x2` multiplication with a single
;; instruction.
(rule 3 (lower (has_type (multi_lane 64 2) (imul x y)))
      (if-let true (use_avx512vl))
      (if-let true (use_avx512dq))
      (x64_vpmullq x y))

;; Otherwise, for i64x2 multiplication we describe a lane A as being composed of
;; a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
;; multiplication can then be written as:
;;
;;    Ah Al
;; *  Bh Bl
;;    -----
;;    Al * Bl
;; + (Ah * Bl) << 32
;; + (Al * Bh) << 32
;;
;; So for each lane we will compute:
;;
;;   A * B  = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
;;
;; Note, the algorithm will use `pmuludq` which operates directly on the lower
;; 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
;; the lane of the destination. For this reason we don't need shifts to isolate
;; the lower 32-bits, however, we will need to use shifts to isolate the high
;; 32-bits when doing calculations, i.e., `Ah == A >> 32`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul a b)))
      (let ((a0 Xmm a)
            (b0 Xmm b)
            ;; a_hi = A >> 32
            (a_hi Xmm (x64_psrlq a0 (xmi_imm 32)))
            ;; ah_bl = Ah * Bl
            (ah_bl Xmm (x64_pmuludq a_hi b0))
            ;; b_hi = B >> 32
            (b_hi Xmm (x64_psrlq b0 (xmi_imm 32)))
            ;; al_bh = Al * Bh
            (al_bh Xmm (x64_pmuludq a0 b_hi))
            ;; aa_bb = ah_bl + al_bh
            (aa_bb Xmm (x64_paddq ah_bl al_bh))
            ;; aa_bb_shifted = aa_bb << 32
            (aa_bb_shifted Xmm (x64_psllq aa_bb (xmi_imm 32)))
            ;; al_bl = Al * Bl
            (al_bl Xmm (x64_pmuludq a0 b0)))
        ;; al_bl + aa_bb_shifted
        (x64_paddq al_bl aa_bb_shifted)))

;; Special case for `i32x4.extmul_high_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul (swiden_high (and (value_type (multi_lane 16 8))
                                               x))
                             (swiden_high (and (value_type (multi_lane 16 8))
                                               y)))))
      (let ((x2 Xmm x)
            (y2 Xmm y)
            (lo Xmm (x64_pmullw x2 y2))
            (hi Xmm (x64_pmulhw x2 y2)))
        (x64_punpckhwd lo hi)))

;; Special case for `i64x2.extmul_high_i32x4_s`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul (swiden_high (and (value_type (multi_lane 32 4))
                                               x))
                             (swiden_high (and (value_type (multi_lane 32 4))
                                               y)))))
      (if-let true (use_sse41))
      (let ((x2 Xmm (x64_pshufd x 0xFA))
            (y2 Xmm (x64_pshufd y 0xFA)))
        (x64_pmuldq x2 y2)))

;; Special case for `i32x4.extmul_low_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul (swiden_low (and (value_type (multi_lane 16 8))
                                              x))
                             (swiden_low (and (value_type (multi_lane 16 8))
                                              y)))))
      (let ((x2 Xmm x)
            (y2 Xmm y)
            (lo Xmm (x64_pmullw x2 y2))
            (hi Xmm (x64_pmulhw x2 y2)))
        (x64_punpcklwd lo hi)))

;; Special case for `i64x2.extmul_low_i32x4_s`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul (swiden_low (and (value_type (multi_lane 32 4))
                                              x))
                             (swiden_low (and (value_type (multi_lane 32 4))
                                              y)))))
      (if-let true (use_sse41))
      (let ((x2 Xmm (x64_pshufd x 0x50))
            (y2 Xmm (x64_pshufd y 0x50)))
        (x64_pmuldq x2 y2)))

;; Special case for `i32x4.extmul_high_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul (uwiden_high (and (value_type (multi_lane 16 8))
                                               x))
                             (uwiden_high (and (value_type (multi_lane 16 8))
                                               y)))))
      (let ((x2 Xmm x)
            (y2 Xmm y)
            (lo Xmm (x64_pmullw x2 y2))
            (hi Xmm (x64_pmulhuw x2 y2)))
        (x64_punpckhwd lo hi)))

;; Special case for `i64x2.extmul_high_i32x4_u`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul (uwiden_high (and (value_type (multi_lane 32 4))
                                               x))
                             (uwiden_high (and (value_type (multi_lane 32 4))
                                               y)))))
      (let ((x2 Xmm (x64_pshufd x 0xFA))
            (y2 Xmm (x64_pshufd y 0xFA)))
        (x64_pmuludq x2 y2)))

;; Special case for `i32x4.extmul_low_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul (uwiden_low (and (value_type (multi_lane 16 8))
                                              x))
                             (uwiden_low (and (value_type (multi_lane 16 8))
                                              y)))))
      (let ((x2 Xmm x)
            (y2 Xmm y)
            (lo Xmm (x64_pmullw x2 y2))
            (hi Xmm (x64_pmulhuw x2 y2)))
        (x64_punpcklwd lo hi)))

;; Special case for `i64x2.extmul_low_i32x4_u`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul (uwiden_low (and (value_type (multi_lane 32 4))
                                              x))
                             (uwiden_low (and (value_type (multi_lane 32 4))
                                              y)))))
      (let ((x2 Xmm (x64_pshufd x 0x50))
            (y2 Xmm (x64_pshufd y 0x50)))
        (x64_pmuludq x2 y2)))

;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (has_type $I8X16 (iabs x)))
        (if-let true (use_ssse3))
        (x64_pabsb x))

;; Note the use of `pminub` with signed inputs will produce the positive signed
;; result which is what is desired here. The `pmaxub` isn't available until
;; SSE4.1 in which case the single-instruction above lowering would apply.
(rule (lower (has_type $I8X16 (iabs x)))
      (let (
          (x Xmm x)
          (negated Xmm (x64_psubb (xmm_zero $I8X16) x))
        )
        (x64_pminub x negated)))

(rule 1 (lower (has_type $I16X8 (iabs x)))
        (if-let true (use_ssse3))
        (x64_pabsw x))

(rule (lower (has_type $I16X8 (iabs x)))
      (let (
          (x Xmm x)
          (negated Xmm (x64_psubw (xmm_zero $I16X8) x))
        )
        (x64_pmaxsw x negated)))

(rule 1 (lower (has_type $I32X4 (iabs x)))
        (if-let true (use_ssse3))
        (x64_pabsd x))

;; Generate a `negative_mask` which is either numerically -1 or 0 depending on
;; if the lane is negative. If the lane is positive then the xor operation
;; won't change the lane but otherwise it'll bit-flip everything. By then
;; subtracting the mask this subtracts 0 for positive lanes (does nothing) or
;; ends up adding one for negative lanes. This means that for a negative lane
;; `x` the result is `!x + 1` which is the result of negating it.
(rule (lower (has_type $I32X4 (iabs x)))
      (let (
          (x Xmm x)
          (negative_mask Xmm (x64_psrad x (xmi_imm 31)))
          (flipped_if_negative Xmm (x64_pxor x negative_mask))
        )
        (x64_psubd flipped_if_negative negative_mask)))

;; When AVX512 is available, we can use a single `vpabsq` instruction.
(rule 2 (lower (has_type $I64X2 (iabs x)))
      (if-let true (use_avx512vl))
      (if-let true (use_avx512f))
      (x64_vpabsq x))

;; Otherwise, we use a separate register, `neg`, to contain the results of `0 -
;; x` and then blend in those results with `blendvpd` if the MSB of `neg` was
;; set to 1 (i.e. if `neg` was negative or, conversely, if `x` was originally
;; positive).
(rule 1 (lower (has_type $I64X2 (iabs x)))
        (if-let true (use_sse41))
        (let ((rx Xmm x)
              (neg Xmm (x64_psubq (imm $I64X2 0) rx)))
          (x64_blendvpd neg rx neg)))

;; and if `blendvpd` isn't available then perform a shift/shuffle to generate a
;; mask of which lanes are negative, followed by flipping bits/sub to make both
;; positive.
(rule (lower (has_type $I64X2 (iabs x)))
      (let ((x Xmm x)
            (signs Xmm (x64_psrad x (RegMemImm.Imm 31)))
            (signs Xmm (x64_pshufd signs 0b11_11_01_01))
            (xor_if_negative Xmm (x64_pxor x signs)))
        (x64_psubq xor_if_negative signs)))

;; `i64` and smaller.

(rule -1 (lower (has_type (fits_in_64 ty) (iabs x)))
      (let ((src Gpr x)
            (neg ProducesFlags (x64_neg_paired ty src))
            ;; Manually extract the result from the neg, then ignore
            ;; it below, since we need to pass it into the cmove
            ;; before we pass the cmove to with_flags_reg.
            (neg_result Gpr (produces_flags_get_reg neg))
            ;; When the neg instruction sets the sign flag,
            ;; takes the original (non-negative) value.
            (cmove ConsumesFlags (cmove ty (CC.S) src neg_result)))
        (with_flags_reg (produces_flags_ignore neg) cmove)))

;; `i128`. Negate the low bits, `adc` to the higher bits, then negate high bits.
(rule (lower (has_type $I128 (iabs x)))
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs x)
            (x_lo Gpr (value_regs_get_gpr x_regs 0))
            (x_hi Gpr (value_regs_get_gpr x_regs 1))
            ; negate low bits, then add 0 with carry to high bits.
            (neg_lo ProducesFlags (x64_neg_paired $I64 x_lo))
            (adc_hi ConsumesFlags (x64_adc_paired $I64 x_hi (imm $I64 0)))
            (neg_adc_vals ValueRegs (with_flags neg_lo adc_hi))
            ; negate high bits.
            (neg_hi ProducesFlags (x64_neg_paired $I64 (value_regs_get neg_adc_vals 1)))
            (neg_hi_flag_only ProducesFlags (produces_flags_ignore neg_hi))
            ; cmove based on sign flag from hi negation.
            (cmove_lo ConsumesFlags (cmove $I64 (CC.S) x_lo
                                     (value_regs_get neg_adc_vals 0)))
            (cmove_hi ConsumesFlags (cmove $I64 (CC.S) x_hi
                                     (produces_flags_get_reg neg_hi)))
            (cmoves ConsumesFlags (consumes_flags_concat cmove_lo cmove_hi)))
        (with_flags neg_hi_flag_only cmoves)))

;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fabs x)))
      (x64_andps x (imm $F32 0x7fffffff)))

(rule (lower (has_type $F64 (fabs x)))
      (x64_andpd x (imm $F64 0x7fffffffffffffff)))

;; Special case for `f32x4.abs`.
(rule (lower (has_type $F32X4 (fabs x)))
      (x64_andps x
             (x64_psrld (vector_all_ones) (xmi_imm 1))))

;; Special case for `f64x2.abs`.
(rule (lower (has_type $F64X2 (fabs x)))
      (x64_andpd x
             (x64_psrlq (vector_all_ones) (xmi_imm 1))))

;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fneg x)))
      (x64_xorps x (imm $F32 0x80000000)))

(rule (lower (has_type $F64 (fneg x)))
      (x64_xorpd x (imm $F64 0x8000000000000000)))

(rule (lower (has_type $F32X4 (fneg x)))
      (x64_xorps x
             (x64_pslld (vector_all_ones) (xmi_imm 31))))

(rule (lower (has_type $F64X2 (fneg x)))
      (x64_xorpd x
             (x64_psllq (vector_all_ones) (xmi_imm 63))))

;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(decl lower_bmask (Type Type ValueRegs) ValueRegs)

;; Values that fit in a register
;;
;; Use the neg instruction on the input which sets the CF (carry) flag
;; to 0 if the input is 0 or 1 otherwise.
;; We then subtract the output register with itself, which always gives a 0,
;; however use the carry flag from the previous negate to generate a -1 if it
;; was nonzero.
;;
;; neg in_reg
;; sbb out_reg, out_reg
(rule 0
      (lower_bmask (fits_in_64 out_ty) (fits_in_64 in_ty) val)
      (let ((reg Gpr (value_regs_get_gpr val 0))
            (out ValueRegs (with_flags
                  (x64_neg_paired in_ty reg)
                  (x64_sbb_paired out_ty reg reg))))
        ;; Extract only the output of the sbb instruction
        (value_reg (value_regs_get out 1))))


;; If the input type is I128 we can `or` the registers, and recurse to the general case.
(rule 1
      (lower_bmask (fits_in_64 out_ty) $I128 val)
      (let ((lo Gpr (value_regs_get_gpr val 0))
            (hi Gpr (value_regs_get_gpr val 1))
            (mixed Gpr (x64_or $I64 lo hi)))
        (lower_bmask out_ty $I64 (value_reg mixed))))

;; If the output type is I128 we just duplicate the result of the I64 lowering
(rule 2
      (lower_bmask $I128 in_ty val)
      (let ((res ValueRegs (lower_bmask $I64 in_ty val))
            (res Gpr (value_regs_get_gpr res 0)))
        (value_regs res res)))


;; Call the lower_bmask rule that does all the procssing
(rule (lower (has_type out_ty (bmask x @ (value_type in_ty))))
      (lower_bmask out_ty in_ty x))

;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

(rule -2 (lower (has_type ty (bnot x)))
      (if (ty_int_ref_scalar_64 ty))
      (x64_not ty x))


;; `i128`.

(decl i128_not (Value) ValueRegs)
(rule (i128_not x)
      (let ((x_regs ValueRegs x)
            (x_lo Gpr (value_regs_get_gpr x_regs 0))
            (x_hi Gpr (value_regs_get_gpr x_regs 1)))
        (value_gprs (x64_not $I64 x_lo)
                    (x64_not $I64 x_hi))))

(rule (lower (has_type $I128 (bnot x)))
      (i128_not x))

;; f32 and f64

(rule -3 (lower (has_type (ty_scalar_float ty) (bnot x)))
      (x64_xor_vector ty x (vector_all_ones)))

;; Special case for vector-types where bit-negation is an xor against an
;; all-one value
(rule -1 (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x)))
      (x64_xor_vector ty x (vector_all_ones)))

;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty @ (multi_lane _bits _lanes)
                       (bitselect condition
                                  if_true
                                  if_false)))
      ;; a = and if_true, condition
      ;; b = and_not condition, if_false
      ;; or b, a
      (let ((cond_xmm Xmm condition)
            (a Xmm (sse_and ty if_true cond_xmm))
            (b Xmm (sse_and_not ty cond_xmm if_false)))
        (sse_or ty b a)))

;; If every byte of the condition is guaranteed to be all ones or all zeroes,
;; we can use x64_blend.
(rule 1 (lower (has_type ty @ (multi_lane _bits _lanes)
                         (bitselect condition
                                    if_true
                                    if_false)))
      (if-let true (use_sse41))
      (if (all_ones_or_all_zeros condition))
      (x64_pblendvb if_false if_true condition))

(decl pure partial all_ones_or_all_zeros (Value) bool)
(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) true)
(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) true)
(rule (all_ones_or_all_zeros (and (bitcast _ (fcmp _ _ _)) (value_type (multi_lane _ _)))) true)
(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) true)

(decl pure vconst_all_ones_or_all_zeros () Constant)
(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros)

;; Specializations for floating-pointer compares to generate a `minp*` or a
;; `maxp*` instruction. These are equivalent to the wasm `f32x4.{pmin,pmax}`
;; instructions and how they're lowered into CLIF. Note the careful ordering
;; of all the operands here to ensure that the input CLIF matched is implemented
;; by the corresponding x64 instruction.
(rule 2 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y)))
        (x64_minps x y))
(rule 2 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y)))
        (x64_minpd x y))

(rule 3 (lower (has_type $F32X4 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y)))
        (x64_maxps x y))
(rule 3 (lower (has_type $F64X2 (bitselect (bitcast _ (fcmp (FloatCC.LessThan) y x)) x y)))
        (x64_maxpd x y))

;; Scalar rules

(rule 3 (lower (has_type $I128 (bitselect c t f)))
      (let ((a ValueRegs (and_i128 c t))
            (b ValueRegs (and_i128 (i128_not c) f)))
        (or_i128 a b)))

(rule 4 (lower (has_type (ty_int_ref_scalar_64 ty) (bitselect c t f)))
      (let ((a Gpr (x64_and ty c t))
            (b Gpr (x64_and ty (x64_not ty c) f)))
        (x64_or ty a b)))

(rule 5 (lower (has_type (ty_scalar_float ty) (bitselect c t f)))
      (let ((a Xmm (sse_and ty c t))
            (c_neg Xmm (x64_xor_vector ty c (vector_all_ones)))
            (b Xmm (sse_and ty c_neg f)))
        (sse_or ty a b)))

;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16
                       (x86_blendv condition if_true if_false)))
      (if-let true (use_sse41))
      (x64_pblendvb if_false if_true condition))

(rule (lower (has_type $I32X4
                       (x86_blendv condition if_true if_false)))
      (if-let true (use_sse41))
      (x64_blendvps if_false if_true condition))

(rule (lower (has_type $I64X2
                       (x86_blendv condition if_true if_false)))
      (if-let true (use_sse41))
      (x64_blendvpd if_false if_true condition))

;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (insertlane vec @ (value_type $I8X16) val (u8_from_uimm8 idx)))
  (if-let true (use_sse41))
  (x64_pinsrb vec val idx))
(rule 2 (lower (insertlane vec @ (value_type $I8X16) (sinkable_load_exact val) (u8_from_uimm8 idx)))
  (if-let true (use_sse41))
  (x64_pinsrb vec val idx))

;; This lowering is particularly unoptimized and is mostly just here to work
;; rather than here to be fast. Requiring SSE 4.1 for the above lowering isn't
;; the end of the world hopefully as that's a pretty old instruction set, so
;; this is the "simplest" version that works on SSE2 for now.
;;
;; This lowering masks the original vector with a constant with all 1s except
;; for the "hole" where this value will get placed into, meaning the desired
;; lane is guaranteed as all 0s. Next the `val` is shuffled into this hole with
;; a few operations:
;;
;;  1. The `val` is zero-extended to 32-bits to guarantee the lower 32-bits
;;     are all defined.
;;  2. An arithmetic shift-left is used with the low two bits of `n`, the
;;     desired lane, to move the value into the right position within the 32-bit
;;     register value.
;;  3. The 32-bit register is moved with `movd` into an XMM register
;;  4. The XMM register, where all lanes are 0 except for the first lane which
;;     has the shifted value, is then shuffled with `pshufd` to move the
;;     shifted value to the correct and final lane. This uses the upper two
;;     bits of `n` to index the i32x4 lane that we're targeting.
;;
;; This all, laboriously, gets the `val` into the desired lane so it's then
;; `por`'d with the original vec-with-a-hole to produce the final result of the
;; insertion.
(rule (lower (insertlane vec @ (value_type $I8X16) val (u8_from_uimm8 n)))
      (let ((vec_with_hole Xmm (x64_pand vec (insert_i8x16_lane_hole n)))
            (val Gpr (x64_movzx (ExtMode.BL) val))
            (val Gpr (x64_shl $I32 val (Imm8Reg.Imm8 (u8_shl (u8_and n 3) 3))))
            (val Xmm (x64_movd_to_xmm val))
            (val_at_hole Xmm (x64_pshufd val (insert_i8x16_lane_pshufd_imm (u8_shr n 2)))))
        (x64_por vec_with_hole val_at_hole)))

(decl insert_i8x16_lane_hole (u8) VCodeConstant)
(extern constructor insert_i8x16_lane_hole insert_i8x16_lane_hole)
(decl insert_i8x16_lane_pshufd_imm (u8) u8)
(rule (insert_i8x16_lane_pshufd_imm 0) 0b01_01_01_00)
(rule (insert_i8x16_lane_pshufd_imm 1) 0b01_01_00_01)
(rule (insert_i8x16_lane_pshufd_imm 2) 0b01_00_01_01)
(rule (insert_i8x16_lane_pshufd_imm 3) 0b00_01_01_01)


;; i16x8.replace_lane
(rule (lower (insertlane vec @ (value_type $I16X8) val (u8_from_uimm8 idx)))
  (x64_pinsrw vec val idx))
(rule 1 (lower (insertlane vec @ (value_type $I16X8) (sinkable_load_exact val) (u8_from_uimm8 idx)))
  (x64_pinsrw vec val idx))

;; i32x4.replace_lane
(rule 1 (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 idx)))
        (if-let true (use_sse41))
        (x64_pinsrd vec val idx))

(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 0)))
  (x64_movss_regmove vec (x64_movd_to_xmm val)))

;; tmp    = [ vec[1] vec[0] val[1] val[0] ]
;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 1)))
      (let ((val Xmm (x64_movd_to_xmm val))
            (vec Xmm vec))
        (x64_shufps (x64_punpcklqdq val vec) vec 0b11_10_00_10)))

;; tmp    = [ vec[0] vec[3] val[0] val[0] ]
;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 2)))
      (let ((val Xmm (x64_movd_to_xmm val))
            (vec Xmm vec))
        (x64_shufps vec (x64_shufps val vec 0b00_11_00_00) 0b10_00_01_00)))

;; tmp    = [ vec[3] vec[2] val[1] val[0] ]
;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
(rule (lower (insertlane vec @ (value_type $I32X4) val (u8_from_uimm8 3)))
      (let ((val Xmm (x64_movd_to_xmm val))
            (vec Xmm vec))
        (x64_shufps vec (x64_shufps val vec 0b11_10_01_00) 0b00_10_01_00)))

;; i64x2.replace_lane
(rule 1 (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 idx)))
        (if-let true (use_sse41))
        (x64_pinsrq vec val idx))
(rule (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 0)))
      (x64_movsd_regmove vec (x64_movq_to_xmm val)))
(rule (lower (insertlane vec @ (value_type $I64X2) val (u8_from_uimm8 1)))
      (x64_punpcklqdq vec (x64_movq_to_xmm val)))

;; (i64x2.replace_lane 1) with a splat as source for lane 0 -- we can elide
;; the splat and just do a move. This turns out to be a common pattern when
;; constructing an i64x2 out of two i64s.
(rule 3 (lower (insertlane (has_type $I64X2 (splat lane0))
                           lane1
                           (u8_from_uimm8 1)))
        (if-let true (use_sse41))
        (x64_pinsrq (bitcast_gpr_to_xmm 64 lane0) lane1 1))

(rule 1 (lower (insertlane vec @ (value_type $F32X4) (sinkable_load val) (u8_from_uimm8 idx)))
  (if-let true (use_sse41))
  (x64_insertps vec val (sse_insertps_lane_imm idx)))
(rule (lower (insertlane vec @ (value_type $F32X4) val (u8_from_uimm8 idx)))
  (f32x4_insertlane vec val idx))

;; Helper function used below for `insertlane` but also here for other
(decl f32x4_insertlane (Xmm Xmm u8) Xmm)

;; f32x4.replace_lane
(rule 1 (f32x4_insertlane vec val idx)
        (if-let true (use_sse41))
        (x64_insertps vec val (sse_insertps_lane_imm idx)))

;; External rust code used to calculate the immediate value to `insertps`.
(decl sse_insertps_lane_imm (u8) u8)
(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm)

;; f32x4.replace_lane 0
(rule (f32x4_insertlane vec val 0)
      (x64_movss_regmove vec val))

;; f32x4.replace_lane 1
;; tmp    = [ vec[1] vec[0] val[1] val[0] ]
;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
(rule (f32x4_insertlane vec val 1)
      (let ((tmp Xmm (x64_movlhps val vec)))
        (x64_shufps tmp vec 0b11_10_00_10)))

;; f32x4.replace_lane 2
;; tmp    = [ vec[0] vec[3] val[0] val[0] ]
;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
(rule (f32x4_insertlane vec val 2)
      (let ((tmp Xmm (x64_shufps val vec 0b00_11_00_00)))
        (x64_shufps vec tmp 0b10_00_01_00)))

;; f32x4.replace_lane 3
;; tmp    = [ vec[3] vec[2] val[1] val[0] ]
;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
(rule (f32x4_insertlane vec val 3)
      (let ((tmp Xmm (x64_shufps val vec 0b11_10_01_00)))
        (x64_shufps vec tmp 0b00_10_01_00)))

;; f64x2.replace_lane 0
;;
;; Here the `movsd` instruction is used specifically to specialize moving
;; into the fist lane where unlike above cases we're not using the lane
;; immediate as an immediate to the instruction itself.
(rule (lower (insertlane vec @ (value_type $F64X2) val (u8_from_uimm8 0)))
      (x64_movsd_regmove vec val))

;; f64x2.replace_lane 1
;;
;; Here the `movlhps` instruction is used specifically to specialize moving
;; into the second lane where unlike above cases we're not using the lane
;; immediate as an immediate to the instruction itself.
(rule (lower (insertlane vec @ (value_type $F64X2) val (u8_from_uimm8 1)))
      (x64_movlhps vec val))

;;;; Rules for `smin`, `smax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

(decl cmp_and_choose (Type CC Value Value) ValueRegs)
(rule (cmp_and_choose (fits_in_64 ty) cc x y)
      (let ((size OperandSize (raw_operand_size_of_type ty))
            ;; We need to put x and y in registers explicitly because
            ;; we use the values more than once. Hence, even if these
            ;; are "unique uses" at the CLIF level and would otherwise
            ;; allow for load-op merging, here we cannot do that.
            (x_reg Reg x)
            (y_reg Reg y))
        (with_flags_reg (x64_cmp size y_reg x_reg)
                        (cmove ty cc y_reg x_reg))))

(rule -1 (lower (has_type (fits_in_64 ty) (umin x y)))
      (cmp_and_choose ty (CC.B) x y))

(rule -1 (lower (has_type (fits_in_64 ty) (umax x y)))
      (cmp_and_choose ty (CC.NB) x y))

(rule -1 (lower (has_type (fits_in_64 ty) (smin x y)))
      (cmp_and_choose ty (CC.L) x y))

(rule -1 (lower (has_type (fits_in_64 ty) (smax x y)))
      (cmp_and_choose ty (CC.NL) x y))

;; SSE helpers for determining if single-instruction lowerings are available.

(decl pure has_pmins (Type) bool)
(rule 1 (has_pmins $I16X8) true)
(rule 1 (has_pmins $I64X2) false)
(rule (has_pmins _) (use_sse41))

(decl pure has_pmaxs (Type) bool)
(rule 1 (has_pmaxs $I16X8) true)
(rule 1 (has_pmaxs $I64X2) false)
(rule (has_pmaxs _) (use_sse41))

(decl pure has_pmaxu (Type) bool)
(rule 1 (has_pmaxu $I8X16) true)
(rule 1 (has_pmaxu $I64X2) false)
(rule (has_pmaxu _) (use_sse41))

(decl pure has_pminu (Type) bool)
(rule 1 (has_pminu $I8X16) true)
(rule 1 (has_pminu $I64X2) false)
(rule (has_pminu _) (use_sse41))

;; SSE `smax`.

(rule (lower (has_type (ty_vec128 ty) (smax x y)))
      (lower_vec_smax ty x y))

(decl lower_vec_smax (Type Xmm Xmm) Xmm)
(rule 1 (lower_vec_smax ty x y)
        (if-let true (has_pmaxs ty))
        (x64_pmaxs ty x y))

(rule (lower_vec_smax ty x y)
      (let (
          (x Xmm x)
          (y Xmm y)
          (cmp Xmm (x64_pcmpgt ty x y))
          (x_is_max Xmm (x64_pand cmp x))
          (y_is_max Xmm (x64_pandn cmp y))
        )
        (x64_por x_is_max y_is_max)))

;; SSE `smin`.

(rule 1 (lower (has_type (ty_vec128 ty) (smin x y)))
        (if-let true (has_pmins ty))
        (x64_pmins ty x y))

(rule (lower (has_type (ty_vec128 ty) (smin x y)))
      (let (
          (x Xmm x)
          (y Xmm y)
          (cmp Xmm (x64_pcmpgt ty y x))
          (x_is_min Xmm (x64_pand cmp x))
          (y_is_min Xmm (x64_pandn cmp y))
        )
        (x64_por x_is_min y_is_min)))

;; SSE `umax`.

(rule 2 (lower (has_type (ty_vec128 ty) (umax x y)))
        (if-let true (has_pmaxu ty))
        (x64_pmaxu ty x y))

;; If y < x then the saturating subtraction will be zero, otherwise when added
;; back to x it'll return y.
(rule 1 (lower (has_type $I16X8 (umax x y)))
        (let ((x Xmm x))
          (x64_paddw x (x64_psubusw y x))))

;; Flip the upper bits of each lane so the signed comparison has the same
;; result as a signed comparison, and then select the results with the output
;; mask. See `pcmpgt` lowering for info on flipping the upper bit.
(rule (lower (has_type (ty_vec128 ty) (umax x y)))
      (let (
          (x Xmm x)
          (y Xmm y)
          (mask Xmm (flip_high_bit_mask ty))
          (x_masked Xmm (x64_pxor x mask))
          (y_masked Xmm (x64_pxor y mask))
          (cmp Xmm (x64_pcmpgt ty x_masked y_masked))
          (x_is_max Xmm (x64_pand cmp x))
          (y_is_max Xmm (x64_pandn cmp y))
        )
        (x64_por x_is_max y_is_max)))

(decl flip_high_bit_mask (Type) Xmm)
(rule (flip_high_bit_mask $I16X8)
      (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)))
(rule (flip_high_bit_mask $I32X4)
      (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000)))
(rule (flip_high_bit_mask $I64X2)
      (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000)))

;; SSE `umin`.

(rule 2 (lower (has_type (ty_vec128 ty) (umin x y)))
        (if-let true (has_pminu ty))
        (x64_pminu ty x y))

;; If x < y then the saturating subtraction will be 0. Otherwise if x > y then
;; the saturated result, when subtracted again, will go back to `y`.
(rule 1 (lower (has_type $I16X8 (umin x y)))
        (let ((x Xmm x))
          (x64_psubw x (x64_psubusw x y))))

;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit.
(rule (lower (has_type (ty_vec128 ty) (umin x y)))
      (let (
          (x Xmm x)
          (y Xmm y)
          (mask Xmm (flip_high_bit_mask ty))
          (x_masked Xmm (x64_pxor x mask))
          (y_masked Xmm (x64_pxor y mask))
          (cmp Xmm (x64_pcmpgt ty y_masked x_masked))
          (x_is_max Xmm (x64_pand cmp x))
          (y_is_max Xmm (x64_pandn cmp y))
        )
        (x64_por x_is_max y_is_max)))

;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trap code))
      (side_effect (x64_ud2 code)))

;;;; Rules for `trapz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (trapz val code))
        (side_effect (trap_if_val (ZeroCond.Zero) val code)))

(rule 1 (lower (trapz (icmp cc a b) code))
        (side_effect (trap_if_icmp (emit_cmp (intcc_complement cc) a b) code)))

;;;; Rules for `trapnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (trapnz val code))
        (side_effect (trap_if_val (ZeroCond.NonZero) val code)))

(rule 1 (lower (trapnz (icmp cc a b) code))
        (side_effect (trap_if_icmp (emit_cmp cc a b) code)))

;;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap a b tc)))
      (with_flags
        (x64_add_with_flags_paired ty a b)
        (trap_if (CC.B) tc)))

;; Handle lhs immediates/sinkable loads in addition to the automatic rhs
;; handling of above.

(rule 1 (lower (has_type (fits_in_64 ty)
                         (uadd_overflow_trap (simm32_from_value a) b tc)))
      (with_flags
        (x64_add_with_flags_paired ty b a)
        (trap_if (CC.B) tc)))

(rule 2 (lower (has_type (fits_in_64 ty)
                         (uadd_overflow_trap (sinkable_load a) b tc)))
      (with_flags
        (x64_add_with_flags_paired ty b a)
        (trap_if (CC.B) tc)))

;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return args))
      (lower_return args))

;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -2 (lower (icmp cc a @ (value_type (fits_in_64 ty)) b))
      (lower_icmp_bool (emit_cmp cc a b)))

(rule -1 (lower (icmp cc a @ (value_type $I128) b))
      (lower_icmp_bool (emit_cmp cc a b)))

;; Peephole optimization for `x < 0`, when x is a signed 64 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0))))
      (x64_shr $I64 x (Imm8Reg.Imm8 63)))

;; Peephole optimization for `0 > x`, when x is a signed 64 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64))))
      (x64_shr $I64 x (Imm8Reg.Imm8 63)))

;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64))))
      (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63)))

;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0))))
      (x64_shr $I64 (x64_not $I64 x) (Imm8Reg.Imm8 63)))

;; Peephole optimization for `x < 0`, when x is a signed 32 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0))))
      (x64_shr $I32 x (Imm8Reg.Imm8 31)))

;; Peephole optimization for `0 > x`, when x is a signed 32 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32))))
      (x64_shr $I32 x (Imm8Reg.Imm8 31)))

;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32))))
      (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31)))

;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value
(rule 2 (lower (has_type $I8 (icmp (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0))))
      (x64_shr $I32 (x64_not $I64 x) (Imm8Reg.Imm8 31)))

;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than
;; one. To note: what is different here about the output values is that each
;; lane will be filled with all 1s or all 0s according to the comparison,
;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits
;; unset).
(rule (lower (icmp (IntCC.Equal) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpeq ty a b))

;; To lower a not-equals comparison, we perform an equality comparison
;; (PCMPEQ*) and then invert the bits (PXOR with all 1s).
(rule (lower (icmp (IntCC.NotEqual) a @ (value_type (ty_vec128 ty)) b))
      (let ((checked Xmm (x64_pcmpeq ty a b))
            (all_ones Xmm (vector_all_ones)))
           (x64_pxor checked all_ones)))

;; SSE `sgt`

(rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpgt ty a b))

;; SSE `slt`

(rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_pcmpgt ty b a))

;; SSE `ugt`

;; N.B.: we must manually prevent load coalescing operands; the
;; register allocator gets confused otherwise.
(rule 1 (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
        (if-let true (has_pmaxu ty))
        (let ((a Xmm a)
              (b Xmm b)
              (max Xmm (x64_pmaxu ty a b))
              (eq Xmm (x64_pcmpeq ty max b)))
             (x64_pxor eq (vector_all_ones))))

;; Flip the upper bit of each lane so the result of a signed comparison is the
;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more)
(rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b))
      (let ((mask Xmm (flip_high_bit_mask ty))
            (a_masked Xmm (x64_pxor a mask))
            (b_masked Xmm (x64_pxor b mask)))
           (x64_pcmpgt ty a_masked b_masked)))

;; SSE `ult`

(rule 1 (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
        (if-let true (has_pminu ty))
        ;; N.B.: see note above.
        (let ((a Xmm a)
              (b Xmm b)
              (min Xmm (x64_pminu ty a b))
              (eq Xmm (x64_pcmpeq ty min b)))
             (x64_pxor eq (vector_all_ones))))

;; Flip the upper bit of `a` and `b` so the signed comparison result will
;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more).
(rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b))
      (let ((mask Xmm (flip_high_bit_mask ty))
            (a_masked Xmm (x64_pxor a mask))
            (b_masked Xmm (x64_pxor b mask)))
           (x64_pcmpgt ty b_masked a_masked)))

;; SSE `sge`

;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`.
(rule 1 (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let true (has_pmaxs ty))
        (x64_pcmpeq ty a (x64_pmaxs ty a b)))

;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the
;; result.
(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones)))

;; SSE `sle`

;; With `pmins*` use that and compare the result to `a`.
(rule 1 (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let true (has_pmins ty))
        (x64_pcmpeq ty a (x64_pmins ty a b)))

;; Without `pmins*` perform a greater-than test and invert the result.
(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones)))

;; SSE `uge`

(rule 2 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let true (has_pmaxu ty))
        (x64_pcmpeq ty a (x64_pmaxu ty a b)))

;; Perform a saturating subtract of `a` from `b` and if the result is zero then
;; `a` is greater or equal.
(rule 1 (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b))
         (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8)))

;; Flip the upper bit of each lane so the signed comparison is the same as
;; an unsigned one and then invert the result. See docs on `pcmpgt` for why
;; flipping the upper bit works.
(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let (
          (mask Xmm (flip_high_bit_mask ty))
          (a_masked Xmm (x64_pxor a mask))
          (b_masked Xmm (x64_pxor b mask))
          (cmp Xmm (x64_pcmpgt ty b_masked a_masked))
        )
        (x64_pxor cmp (vector_all_ones))))

;; SSE `ule`

(rule 2 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
        (if-let true (has_pminu ty))
        (x64_pcmpeq ty a (x64_pminu ty a b)))

;; A saturating subtraction will produce zeros if `a` is less than `b`, so
;; compare that result to an all-zeros result to figure out lanes of `a` that
;; are <= to the lanes in `b`
(rule 1 (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b))
        (let ((zeros_if_a_is_min Xmm (x64_psubusw a b)))
            (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I8X16))))

;; Flip the upper bit of each lane in `a` and `b` so a signed comparison
;; produces the same result as an unsigned comparison. Then test test for `gt`
;; and invert the result to get the `le` that is desired here. See docs on
;; `pcmpgt` for why flipping the upper bit works.
(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (let (
          (mask Xmm (flip_high_bit_mask ty))
          (a_masked Xmm (x64_pxor a mask))
          (b_masked Xmm (x64_pxor b mask))
          (cmp Xmm (x64_pcmpgt ty a_masked b_masked))
        )
        (x64_pxor cmp (vector_all_ones))))

;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; CLIF's `fcmp` instruction always operates on XMM registers--both scalar and
;; vector. For the scalar versions, we use the flag-setting behavior of the
;; `UCOMIS*` instruction to `SETcc` a 0 or 1 in a GPR register. Note that CLIF's
;; `select` uses the same kind of flag-setting behavior but chooses values other
;; than 0 or 1.
;;
;; Checking the result of `UCOMIS*` is unfortunately difficult in some cases
;; because we do not have `SETcc` instructions that explicitly check
;; simultaneously for the condition (i.e., `eq`, `le`, `gt`, etc.) *and*
;; orderedness. Instead, we must check the flags multiple times. The UCOMIS*
;; documentation (see Intel's Software Developer's Manual, volume 2, chapter 4)
;; is helpful:
;;  - unordered assigns    Z = 1, P = 1, C = 1
;;  - greater than assigns Z = 0, P = 0, C = 0
;;  - less than assigns    Z = 0, P = 0, C = 1
;;  - equal assigns        Z = 1, P = 0, C = 0

(rule -1 (lower (fcmp cc a @ (value_type (ty_scalar_float ty)) b))
      (lower_fcmp_bool (emit_fcmp cc a b)))

;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that
;; determines the comparison to make. Note that comparisons that succeed will
;; fill the lane with 1s; comparisons that do not will fill the lane with 0s.

(rule (lower (fcmp (FloatCC.Equal) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.Equal)))
(rule (lower (fcmp (FloatCC.NotEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.NotEqual)))
(rule (lower (fcmp (FloatCC.LessThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.LessThan)))
(rule (lower (fcmp (FloatCC.LessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.LessThanOrEqual)))
(rule (lower (fcmp (FloatCC.Ordered) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.Ordered)))
(rule (lower (fcmp (FloatCC.Unordered) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.Unordered)))
(rule (lower (fcmp (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThan)))
(rule (lower (fcmp (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThanOrEqual)))

;; Some vector lowerings rely on flipping the operands and using a reversed
;; comparison code.

(rule (lower (fcmp (FloatCC.GreaterThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty b a (FcmpImm.LessThan)))
(rule (lower (fcmp (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty b a (FcmpImm.LessThanOrEqual)))
(rule (lower (fcmp (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThan)))
(rule (lower (fcmp (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b))
      (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThanOrEqual)))

;; Some vector lowerings are simply not supported for certain codes:
;; - FloatCC::OrderedNotEqual
;; - FloatCC::UnorderedOrEqual

;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; When a `select` has an `fcmp` as a condition then rely on `emit_fcmp` to
;; figure out how to perform the comparison.
;;
;; Note, though, that the `FloatCC.Equal` requires an "and" to happen for two
;; condition codes which isn't the easiest thing to lower to a `cmove`
;; instruction. For this reason a `select (fcmp eq ..) ..` is instead
;; flipped around to be `select (fcmp ne ..) ..` with all operands reversed.
;; This will produce a `FcmpCondResult.OrCondition` which is easier to codegen
;; for.
(rule (lower (has_type ty (select (maybe_uextend (fcmp cc a b)) x y)))
      (lower_select_fcmp ty (emit_fcmp cc a b) x y))
(rule 1 (lower (has_type ty (select (maybe_uextend (fcmp (FloatCC.Equal) a b)) x y)))
        (lower_select_fcmp ty (emit_fcmp (FloatCC.NotEqual) a b) y x))

(decl lower_select_fcmp (Type FcmpCondResult Value Value) InstOutput)
(rule (lower_select_fcmp ty (FcmpCondResult.Condition flags cc) x y)
      (with_flags flags (cmove_from_values ty cc x y)))
(rule (lower_select_fcmp ty (FcmpCondResult.OrCondition flags cc1 cc2) x y)
      (with_flags flags (cmove_or_from_values ty cc1 cc2 x y)))

;; We also can lower `select`s that depend on an `icmp` test, but more simply
;; than the `fcmp` variants above. In these cases, we lower to a `CMP`
;; instruction plus a `CMOV`; recall that `cmove_from_values` here may emit more
;; than one instruction for certain types (e.g., XMM-held, I128).

(rule (lower (has_type ty (select (maybe_uextend (icmp cc a b)) x y)))
      (lower_select_icmp ty (emit_cmp cc a b) x y))

;; Finally, we lower `select` from a condition value `c`. These rules are meant
;; to be the final, default lowerings if no other patterns matched above.

(rule -1 (lower (has_type ty (select c @ (value_type (fits_in_64 a_ty)) x y)))
      (let ((size OperandSize (raw_operand_size_of_type a_ty))
            ;; N.B.: disallow load-op fusion, see above. TODO:
            ;; https://github.com/bytecodealliance/wasmtime/issues/3953.
            (gpr_c Gpr (put_in_gpr c)))
           (with_flags (x64_test size gpr_c gpr_c) (cmove_from_values ty (CC.NZ) x y))))

(rule -2 (lower (has_type ty (select c @ (value_type $I128) x y)))
      (let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c)))
        (select_icmp cond_result x y)))

(decl lower_select_icmp (Type IcmpCondResult Value Value) InstOutput)
(rule (lower_select_icmp ty (IcmpCondResult.Condition flags cc) x y)
      (with_flags flags (cmove_from_values ty cc x y)))

;; Specializations for floating-point compares to generate a `mins*` or a
;; `maxs*` instruction. These are equivalent to the "pseudo-m{in,ax}"
;; specializations for vectors.
(rule 2 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y)))
        (x64_minss x y))
(rule 2 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) x y)) x y)))
        (x64_minsd x y))
(rule 3 (lower (has_type $F32 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y)))
        (x64_maxss x y))
(rule 3 (lower (has_type $F64 (select (maybe_uextend (fcmp (FloatCC.LessThan) y x)) x y)))
        (x64_maxsd x y))

;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower (has_type (ty_32_or_64 ty) (clz src)))
      (do_clz ty ty src))

(rule 1 (lower (has_type (ty_8_or_16 ty) (clz src)))
      (let ((extended Gpr (extend_to_gpr src $I64 (ExtendKind.Zero)))
            (clz Gpr (do_clz $I64 $I64 extended)))
        (x64_sub $I64 clz (RegMemImm.Imm (u32_sub 64 (ty_bits ty))))))


(rule 0 (lower
       (has_type $I128
                 (clz src)))
      (let ((upper Gpr (do_clz $I64 $I64 (value_regs_get_gpr src 1)))
            (lower Gpr (x64_add $I64
                            (do_clz $I64 $I64 (value_regs_get_gpr src 0))
                            (RegMemImm.Imm 64)))
            (result_lo Gpr
              (with_flags_reg
               (x64_cmp_imm (OperandSize.Size64) upper 64)
               (cmove $I64 (CC.NZ) upper lower))))
        (value_regs result_lo (imm $I64 0))))

;; Implementation helper for clz; operates on 32 or 64-bit units.
(decl do_clz (Type Type Gpr) Gpr)

;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs, because the machine
;; instruction does what the CLIF expects for zero, i.e. it returns
;; zero.
(rule 1 (do_clz ty orig_ty src)
      (if-let true (use_lzcnt))
      (x64_lzcnt ty src))

(rule 0 (do_clz ty orig_ty src)
      (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1)))
            (bits_minus_1 Reg (imm ty (u64_sub (ty_bits_u64 orig_ty) 1))))
        (x64_sub ty bits_minus_1 highest_bit_index)))

;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower (has_type (ty_32_or_64 ty) (ctz src)))
      (do_ctz ty ty src))

(rule 1 (lower (has_type (ty_8_or_16 ty) (ctz src)))
      (let ((extended Gpr (extend_to_gpr src $I32 (ExtendKind.Zero)))
            (stopbit Gpr (x64_or $I32 extended (RegMemImm.Imm (u32_shl 1 (ty_bits ty))))))
        (do_ctz $I32 ty stopbit)))

(rule 0 (lower
       (has_type $I128
                 (ctz src)))
      (let ((lower Gpr (do_ctz $I64 $I64 (value_regs_get_gpr src 0)))
            (upper Gpr (x64_add $I64
                            (do_ctz $I64 $I64 (value_regs_get_gpr src 1))
                            (RegMemImm.Imm 64)))
            (result_lo Gpr
              (with_flags_reg
               (x64_cmp_imm (OperandSize.Size64) lower 64)
               (cmove $I64 (CC.Z) upper lower))))
        (value_regs result_lo (imm $I64 0))))

(decl do_ctz (Type Type Gpr) Gpr)

;; Analogous to `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).
(rule 1 (do_ctz ty orig_ty src)
      (if-let true (use_bmi1))
      (x64_tzcnt ty src))

(rule 0 (do_ctz ty orig_ty src)
      (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty))))

;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt src)))
      (if-let true (use_popcnt))
      (x64_popcnt ty src))

(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt src)))
      (if-let true (use_popcnt))
      (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))

(rule 1 (lower (has_type $I128 (popcnt src)))
      (if-let true (use_popcnt))
      (let ((lo_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 0)))
            (hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1))))
        (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0))))

(rule -1 (lower
       (has_type (ty_32_or_64 ty)
                 (popcnt src)))
      (do_popcnt ty src))

(rule -2 (lower
       (has_type (ty_8_or_16 ty)
                 (popcnt src)))
      (do_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero))))

(rule (lower
       (has_type $I128
                 (popcnt src)))
      (let ((lo_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 0)))
            (hi_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 1))))
        (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0))))

;; Implementation of popcount when we don't nave a native popcount
;; instruction.
(decl do_popcnt (Type Gpr) Gpr)
(rule (do_popcnt $I64 src)
      (let ((shifted1 Gpr (x64_shr $I64 src (Imm8Reg.Imm8 1)))
            (sevens Gpr (imm $I64 0x7777777777777777))
            (masked1 Gpr (x64_and $I64 shifted1 sevens))
            ;; diff1 := src - ((src >> 1) & 0b0111_0111_0111...)
            (diff1 Gpr (x64_sub $I64 src masked1))
            (shifted2 Gpr (x64_shr $I64 masked1 (Imm8Reg.Imm8 1)))
            (masked2 Gpr (x64_and $I64 shifted2 sevens))
            ;; diff2 := diff1 - ((diff1 >> 1) & 0b0111_0111_0111...)
            (diff2 Gpr (x64_sub $I64 diff1 masked2))
            (shifted3 Gpr (x64_shr $I64 masked2 (Imm8Reg.Imm8 1)))
            (masked3 Gpr (x64_and $I64 shifted3 sevens))
            ;; diff3 := diff2 - ((diff2 >> 1) & 0b0111_0111_0111...)
            ;;
            ;; At this point, each nibble of diff3 is the popcount of
            ;; that nibble. This works because at each step above, we
            ;; are basically subtracting floor(value / 2) from the
            ;; running value; the leftover remainder is 1 if the LSB
            ;; was 1. After three steps, we have (nibble / 8) -- 0 or
            ;; 1 for the MSB of the nibble -- plus three possible
            ;; additions for the three other bits.
            (diff3 Gpr (x64_sub $I64 diff2 masked3))
            ;; Add the two nibbles of each byte together.
            (sum1 Gpr (x64_add $I64
                           (x64_shr $I64 diff3 (Imm8Reg.Imm8 4))
                           diff3))
            ;; Mask the above sum to have the popcount for each byte
            ;; in the lower nibble of that byte.
            (ofof Gpr (imm $I64 0x0f0f0f0f0f0f0f0f))
            (masked4 Gpr (x64_and $I64 sum1 ofof))
            (ones Gpr (imm $I64 0x0101010101010101))
            ;; Use a multiply to sum all of the bytes' popcounts into
            ;; the top byte. Consider the binomial expansion for the
            ;; top byte: it is the sum of the bytes (masked4 >> 56) *
            ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01
            ;; + ... + (masked4 >> 0).
            (mul Gpr (x64_imul $I64 masked4 ones))
            ;; Now take that top byte and return it as the popcount.
            (final Gpr (x64_shr $I64 mul (Imm8Reg.Imm8 56))))
        final))

;; This is the 32-bit version of the above; the steps for each nibble
;; are the same, we just use constants half as wide.
(rule (do_popcnt $I32 src)
      (let ((shifted1 Gpr (x64_shr $I32 src (Imm8Reg.Imm8 1)))
            (sevens Gpr (imm $I32 0x77777777))
            (masked1 Gpr (x64_and $I32 shifted1 sevens))
            (diff1 Gpr (x64_sub $I32 src masked1))
            (shifted2 Gpr (x64_shr $I32 masked1 (Imm8Reg.Imm8 1)))
            (masked2 Gpr (x64_and $I32 shifted2 sevens))
            (diff2 Gpr (x64_sub $I32 diff1 masked2))
            (shifted3 Gpr (x64_shr $I32 masked2 (Imm8Reg.Imm8 1)))
            (masked3 Gpr (x64_and $I32 shifted3 sevens))
            (diff3 Gpr (x64_sub $I32 diff2 masked3))
            (sum1 Gpr (x64_add $I32
                           (x64_shr $I32 diff3 (Imm8Reg.Imm8 4))
                           diff3))
            (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f)))
            (mul Gpr (x64_imul_imm $I32 masked4 0x01010101))
            (final Gpr (x64_shr $I32 mul (Imm8Reg.Imm8 24))))
        final))


(rule 2 (lower (has_type $I8X16 (popcnt src)))
      (if-let true (use_avx512vl))
      (if-let true (use_avx512bitalg))
      (x64_vpopcntb src))


;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
;;
;; __m128i count_bytes ( __m128i v) {
;;     __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
;;     __m128i low_mask = _mm_set1_epi8 (0x0f);
;;     __m128i lo = _mm_and_si128 (v, low_mask);
;;     __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
;;     __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
;;     __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi);
;;     return _mm_add_epi8 (cnt1, cnt2);
;; }
;;
;; Details of the above algorithm can be found in the reference noted above, but the basics
;; are to create a lookup table that pre populates the popcnt values for each number [0,15].
;; The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
;; lookup process, and adds together the results.
;;
;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);


(rule 1 (lower (has_type $I8X16 (popcnt src)))
      (if-let true (use_ssse3))
      (let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
            (low_nibbles Xmm (sse_and $I8X16 src low_mask))
            ;; Note that this is a 16x8 shift, but that's OK; we mask
            ;; off anything that traverses from one byte to the next
            ;; with the low_mask below.
            (shifted_src Xmm (x64_psrlw src (xmi_imm 4)))
            (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask))
            (lookup Xmm (x64_xmm_load_const $I8X16
              (emit_u128_le_const 0x04030302_03020201_03020201_02010100)))
            (bit_counts_low Xmm (x64_pshufb lookup low_nibbles))
            (bit_counts_high Xmm (x64_pshufb lookup high_nibbles)))
        (x64_paddb bit_counts_low bit_counts_high)))

;; A modified version of the popcnt method from Hacker's Delight.
(rule (lower (has_type $I8X16 (popcnt src)))
      (let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777))
            (src Xmm src)
            (shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1))
            (src Xmm (x64_psubb src shifted))
            (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
            (src Xmm (x64_psubb src shifted))
            (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1))
            (src Xmm (x64_psubb src shifted))
            (src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4)))))
        (x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))))

;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8 (bitrev src)))
      (do_bitrev8 $I32 src))

(rule (lower (has_type $I16 (bitrev src)))
      (do_bitrev16 $I32 src))

(rule (lower (has_type $I32 (bitrev src)))
      (do_bitrev32 $I32 src))

(rule (lower (has_type $I64 (bitrev src)))
      (do_bitrev64 $I64 src))

(rule (lower (has_type $I128 (bitrev src)))
      (value_regs
       (do_bitrev64 $I64 (value_regs_get_gpr src 1))
       (do_bitrev64 $I64 (value_regs_get_gpr src 0))))

(decl do_bitrev8 (Type Gpr) Gpr)
(rule (do_bitrev8 ty src)
      (let ((tymask u64 (ty_mask ty))
            (mask1 Gpr (imm ty (u64_and tymask 0x5555555555555555)))
            (lo1 Gpr (x64_and ty src mask1))
            (hi1 Gpr (x64_and ty (x64_shr ty src (Imm8Reg.Imm8 1)) mask1))
            (swap1 Gpr (x64_or ty
                           (x64_shl ty lo1 (Imm8Reg.Imm8 1))
                           hi1))
            (mask2 Gpr (imm ty (u64_and tymask 0x3333333333333333)))
            (lo2 Gpr (x64_and ty swap1 mask2))
            (hi2 Gpr (x64_and ty (x64_shr ty swap1 (Imm8Reg.Imm8 2)) mask2))
            (swap2 Gpr (x64_or ty
                           (x64_shl ty lo2 (Imm8Reg.Imm8 2))
                           hi2))
            (mask4 Gpr (imm ty (u64_and tymask 0x0f0f0f0f0f0f0f0f)))
            (lo4 Gpr (x64_and ty swap2 mask4))
            (hi4 Gpr (x64_and ty (x64_shr ty swap2 (Imm8Reg.Imm8 4)) mask4))
            (swap4 Gpr (x64_or ty
                           (x64_shl ty lo4 (Imm8Reg.Imm8 4))
                           hi4)))
        swap4))

(decl do_bitrev16 (Type Gpr) Gpr)
(rule (do_bitrev16 ty src)
      (let ((src_ Gpr (do_bitrev8 ty src))
            (tymask u64 (ty_mask ty))
            (mask8 Gpr (imm ty (u64_and tymask 0x00ff00ff00ff00ff)))
            (lo8 Gpr (x64_and ty src_ mask8))
            (hi8 Gpr (x64_and ty (x64_shr ty src_ (Imm8Reg.Imm8 8)) mask8))
            (swap8 Gpr (x64_or ty
                           (x64_shl ty lo8 (Imm8Reg.Imm8 8))
                           hi8)))
        swap8))

(decl do_bitrev32 (Type Gpr) Gpr)
(rule (do_bitrev32 ty src)
      (let ((src_ Gpr (do_bitrev16 ty src))
            (tymask u64 (ty_mask ty))
            (mask16 Gpr (imm ty (u64_and tymask 0x0000ffff0000ffff)))
            (lo16 Gpr (x64_and ty src_ mask16))
            (hi16 Gpr (x64_and ty (x64_shr ty src_ (Imm8Reg.Imm8 16)) mask16))
            (swap16 Gpr (x64_or ty
                            (x64_shl ty lo16 (Imm8Reg.Imm8 16))
                            hi16)))
        swap16))

(decl do_bitrev64 (Type Gpr) Gpr)
(rule (do_bitrev64 ty @ $I64 src)
      (let ((src_ Gpr (do_bitrev32 ty src))
            (mask32 Gpr (imm ty 0xffffffff))
            (lo32 Gpr (x64_and ty src_ mask32))
            (hi32 Gpr (x64_shr ty src_ (Imm8Reg.Imm8 32)))
            (swap32 Gpr (x64_or ty
                            (x64_shl ty lo32 (Imm8Reg.Imm8 32))
                            hi32)))
        swap32))

;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; x64 bswap instruction is only for 32- or 64-bit swaps
;; implement the 16-bit swap as a rotl by 8
(rule (lower (has_type $I16 (bswap src)))
      (x64_rotl $I16 src (Imm8Reg.Imm8 8)))

(rule (lower (has_type $I32 (bswap src)))
      (x64_bswap $I32 src))

(rule (lower (has_type $I64 (bswap src)))
      (x64_bswap $I64 src))

(rule (lower (has_type $I128 (bswap src)))
      (value_regs
       (x64_bswap $I64 (value_regs_get_gpr src 1))
       (x64_bswap $I64 (value_regs_get_gpr src 0))))

;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; I{8,16,32,64} -> I128.
(rule (lower (has_type $I128 (uextend src)))
      (value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0)))

;; I{8,16,32} -> I64.
(rule (lower (has_type $I64 (uextend src)))
      (extend_to_gpr src $I64 (ExtendKind.Zero)))

;; I{8,16} -> I32
;; I8 -> I16
(rule -1 (lower (has_type (fits_in_32 _) (uextend src)))
         (extend_to_gpr src $I32 (ExtendKind.Zero)))

;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; I{8,16,32} -> I128.
;;
;; Produce upper 64 bits sign-extended from lower 64: shift right by
;; 63 bits to spread the sign bit across the result.
(rule (lower (has_type $I128 (sextend src)))
      (let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign)))
            (hi Gpr (x64_sar $I64 lo (Imm8Reg.Imm8 63))))
      (value_regs lo hi)))

;; I{8,16,32} -> I64.
(rule (lower (has_type $I64 (sextend src)))
      (extend_to_gpr src $I64 (ExtendKind.Sign)))

;; I{8,16} -> I32
;; I8 -> I16
(rule -1 (lower (has_type (fits_in_32 _) (sextend src)))
         (extend_to_gpr src $I32 (ExtendKind.Sign)))

;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; T -> T is always a no-op, even I128 -> I128.
(rule (lower (has_type ty (ireduce src @ (value_type ty))))
      src)

;; T -> I{64,32,16,8}: We can simply pass through the value: values
;; are always stored with high bits undefined, so we can just leave
;; them be.
(rule 1 (lower (has_type (fits_in_64 ty) (ireduce src)))
      (value_regs_get_gpr src 0))

;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (debugtrap))
      (side_effect (x64_hlt)))

;; Rules for `x86_pmaddubsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I16X8 (x86_pmaddubsw x y)))
      (if-let true (use_ssse3))
      (x64_pmaddubsw y x))

;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fadd x y)))
      (x64_addss x y))
(rule (lower (has_type $F64 (fadd x y)))
      (x64_addsd x y))
(rule (lower (has_type $F32X4 (fadd x y)))
      (x64_addps x y))
(rule (lower (has_type $F64X2 (fadd x y)))
      (x64_addpd x y))

;; The above rules automatically sink loads for rhs operands, so additionally
;; add rules for sinking loads with lhs operands.
(rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y)))
      (x64_addss y x))
(rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y)))
      (x64_addsd y x))
(rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y)))
      (x64_addps y x))
(rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y)))
      (x64_addpd y x))

;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fsub x y)))
      (x64_subss x y))
(rule (lower (has_type $F64 (fsub x y)))
      (x64_subsd x y))
(rule (lower (has_type $F32X4 (fsub x y)))
      (x64_subps x y))
(rule (lower (has_type $F64X2 (fsub x y)))
      (x64_subpd x y))

;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmul x y)))
      (x64_mulss x y))
(rule (lower (has_type $F64 (fmul x y)))
      (x64_mulsd x y))
(rule (lower (has_type $F32X4 (fmul x y)))
      (x64_mulps x y))
(rule (lower (has_type $F64X2 (fmul x y)))
      (x64_mulpd x y))

;; The above rules automatically sink loads for rhs operands, so additionally
;; add rules for sinking loads with lhs operands.
(rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y)))
      (x64_mulss y x))
(rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y)))
      (x64_mulsd y x))
(rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y)))
      (x64_mulps y x))
(rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y)))
      (x64_mulpd y x))

;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fdiv x y)))
      (x64_divss x y))
(rule (lower (has_type $F64 (fdiv x y)))
      (x64_divsd x y))
(rule (lower (has_type $F32X4 (fdiv x y)))
      (x64_divps x y))
(rule (lower (has_type $F64X2 (fdiv x y)))
      (x64_divpd x y))

;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32 (sqrt x)))
      (x64_sqrtss (xmm_zero $F32X4) x))
(rule (lower (has_type $F64 (sqrt x)))
      (x64_sqrtsd (xmm_zero $F64X2) x))
(rule (lower (has_type $F32X4 (sqrt x)))
      (x64_sqrtps x))
(rule (lower (has_type $F64X2 (sqrt x)))
      (x64_sqrtpd x))

;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F64 (fpromote x)))
      (x64_cvtss2sd (xmm_zero $F64X2) x))

;; Rules for `fvpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F64X2 (fvpromote_low x)))
      (x64_cvtps2pd (put_in_xmm x)))

;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32 (fdemote x)))
      (x64_cvtsd2ss (xmm_zero $F32X4) x))

;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32X4 (fvdemote x)))
      (x64_cvtpd2ps x))

;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmin x y)))
      (xmm_min_max_seq $F32 true x y))
(rule (lower (has_type $F64 (fmin x y)))
      (xmm_min_max_seq $F64 true x y))

;; Vector-typed version. We don't use single pseudoinstructions as
;; above, because we don't need to generate a mini-CFG. Instead, we
;; perform a branchless series of operations.
;;
;; We cannot simply use native min instructions (minps, minpd) because
;; NaN handling is different per CLIF semantics than on
;; x86. Specifically, if an argument is NaN, or the arguments are both
;; zero but of opposite signs, then the x86 instruction always
;; produces the second argument. However, per CLIF semantics, we
;; require that fmin(NaN, _) = fmin(_, NaN) = NaN, and fmin(+0, -0) =
;; fmin(-0, +0) = -0.

(rule (lower (has_type $F32X4 (fmin x y)))
      ;; Compute min(x, y) and min(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((x Xmm x) ;; force x/y into registers and disallow load sinking
            (y Xmm y)
            (min1 Xmm (x64_minps x y))
            (min2 Xmm (x64_minps y x))
            ;; Compute the OR of the two. Note that NaNs have an
            ;; exponent field of all-ones (0xFF for F32), so if either
            ;; result is a NaN, this OR will be. And if either is a
            ;; zero (which has an exponent of 0 and mantissa of 0),
            ;; this captures a sign-bit of 1 (negative) if either
            ;; input is negative.
            ;;
            ;; In the case where we don't have a +/-0 mismatch or
            ;; NaNs, then `min1` and `min2` are equal and `min_or` is
            ;; the correct minimum.
            (min_or Xmm (x64_orps min1 min2))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if the min is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmpps min_or min2 (FcmpImm.Unordered)))
            ;; OR in the NaN mask.
            (min_or_2 Xmm (x64_orps min_or is_nan_mask))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnps nan_fraction_mask min_or_2)))
        final))

;; Likewise for F64 lanes, except that the right-shift is by 13 bits
;; (1 sign, 11 exponent, 1 QNaN bit).
(rule (lower (has_type $F64X2 (fmin x y)))
      (let ((x Xmm x) ;; force x/y into registers and disallow load sinking
            (y Xmm y)
            (min1 Xmm (x64_minpd x y))
            (min2 Xmm (x64_minpd y x))
            (min_or Xmm (x64_orpd min1 min2))
            (is_nan_mask Xmm (x64_cmppd min1 min2 (FcmpImm.Unordered)))
            (min_or_2 Xmm (x64_orpd min_or is_nan_mask))
            (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13)))
            (final Xmm (x64_andnpd nan_fraction_mask min_or_2)))
        final))

;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fmax x y)))
      (xmm_min_max_seq $F32 false x y))
(rule (lower (has_type $F64 (fmax x y)))
      (xmm_min_max_seq $F64 false x y))

;; The vector version of fmax here is a dual to the fmin sequence
;; above, almost, with a few differences.

(rule (lower (has_type $F32X4 (fmax x y)))
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((x Xmm x) ;; force x/y into registers and disallow load sinking
            (y Xmm y)
            (max1 Xmm (x64_maxps x y))
            (max2 Xmm (x64_maxps y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `min1` and `min2` are equal and this XOR is zero.
            (max_xor Xmm (x64_xorps max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the xor's exponent bits were the
            ;; complement of max1, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (x64_orps max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (x64_subps max_blended_nan max_xor))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if the min is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmpps max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnps nan_fraction_mask max_blended_nan_positive)))
        final))

(rule (lower (has_type $F64X2 (fmax x y)))
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((x Xmm x) ;; force x/y into registers and disallow load sinking
            (y Xmm y)
            (max1 Xmm (x64_maxpd x y))
            (max2 Xmm (x64_maxpd y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `min1` and `min2` are equal and this XOR is zero.
            (max_xor Xmm (x64_xorpd max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the xor's exponent bits were the
            ;; complement of max1, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (x64_orpd max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (x64_subpd max_blended_nan max_xor))
            ;; `cmpps` with predicate index `3` is `cmpunordps`, or
            ;; "compare unordered": it produces a true mask (all ones)
            ;; in a given lane if the min is a NaN. We use this to
            ;; generate a mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmppd max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 13 top bits (1
            ;; sign, 11 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive)))
        final))

;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Base case for fma is to call out to one of two libcalls. For vectors they
;; need to be decomposed, handle each element individually, and then recomposed.

(rule (lower (has_type $F32 (fma x y z)))
      (libcall_3 (LibCall.FmaF32) x y z))
(rule (lower (has_type $F64 (fma x y z)))
      (libcall_3 (LibCall.FmaF64) x y z))

(rule (lower (has_type $F32X4 (fma x y z)))
      (let (
          (x Xmm (put_in_xmm x))
          (y Xmm (put_in_xmm y))
          (z Xmm (put_in_xmm z))
          (x0 Xmm (libcall_3 (LibCall.FmaF32) x y z))
          (x1 Xmm (libcall_3 (LibCall.FmaF32)
            (x64_pshufd x 1)
            (x64_pshufd y 1)
            (x64_pshufd z 1)))
          (x2 Xmm (libcall_3 (LibCall.FmaF32)
            (x64_pshufd x 2)
            (x64_pshufd y 2)
            (x64_pshufd z 2)))
          (x3 Xmm (libcall_3 (LibCall.FmaF32)
            (x64_pshufd x 3)
            (x64_pshufd y 3)
            (x64_pshufd z 3)))

          (tmp Xmm (f32x4_insertlane x0 x1 1))
          (tmp Xmm (f32x4_insertlane tmp x2 2))
          (tmp Xmm (f32x4_insertlane tmp x3 3))
        )
        tmp))
(rule (lower (has_type $F64X2 (fma x y z)))
      (let (
          (x Xmm (put_in_xmm x))
          (y Xmm (put_in_xmm y))
          (z Xmm (put_in_xmm z))
          (x0 Xmm (libcall_3 (LibCall.FmaF64) x y z))
          (x1 Xmm (libcall_3 (LibCall.FmaF64)
            (x64_pshufd x 0xee)
            (x64_pshufd y 0xee)
            (x64_pshufd z 0xee)))
        )
        (x64_movlhps x0 x1)))


;; Special case for when the `fma` feature is active and a native instruction
;; can be used.
(rule 1 (lower (has_type ty (fma x y z)))
      (if-let true (use_fma))
      (fmadd ty x y z))

(decl fmadd (Type Value Value Value) Xmm)
(decl fnmadd (Type Value Value Value) Xmm)

;; Base case. Note that this will automatically sink a load with `z`, the value
;; to add.
(rule (fmadd ty x y z) (x64_vfmadd213 ty x y z))

;; Allow sinking loads with one of the two values being multiplied in addition
;; to the value being added. Note that both x and y can be sunk here due to
;; multiplication being commutative.
(rule 1 (fmadd ty (sinkable_load x) y z) (x64_vfmadd132 ty y z x))
(rule 2 (fmadd ty x (sinkable_load y) z) (x64_vfmadd132 ty x z y))

;; If one of the values being multiplied is negated then use a `vfnmadd*`
;; instruction instead
(rule 3 (fmadd ty (fneg x) y z) (fnmadd ty x y z))
(rule 4 (fmadd ty x (fneg y) z) (fnmadd ty x y z))

(rule (fnmadd ty x y z) (x64_vfnmadd213 ty x y z))
(rule 1 (fnmadd ty (sinkable_load x) y z) (x64_vfnmadd132 ty y z x))
(rule 2 (fnmadd ty x (sinkable_load y) z) (x64_vfnmadd132 ty x z y))

;; Like `fmadd` if one argument is negated switch which one is being codegen'd
(rule 3 (fnmadd ty (fneg x) y z) (fmadd ty x y z))
(rule 4 (fnmadd ty x (fneg y) z) (fmadd ty x y z))


(rule 2 (lower (has_type ty (fma x y (fneg z))))
      (if-let true (use_fma))
      (fmsub ty x y z))

;; fmsub and fnmsub
(decl fmsub (Type Value Value Value) Xmm)
(decl fnmsub (Type Value Value Value) Xmm)

;; Base case, will sink a load of `z` automatically.
(rule (fmsub ty x y z) (x64_vfmsub213 ty x y z))

;; Allow sinking loads with one of the two values being multiplied in addition
;; to the value being subtracted. Note that both x and y can be sunk here due to
;; multiplication being commutative.
(rule 1 (fmsub ty (sinkable_load x) y z) (x64_vfmsub132 ty y z x))
(rule 2 (fmsub ty x (sinkable_load y) z) (x64_vfmsub132 ty x z y))

;; If one of the values being multiplied is negated then use a `vfnmsub*`
;; instruction instead
(rule 3 (fmsub ty (fneg x) y z) (fnmsub ty x y z))
(rule 4 (fmsub ty x (fneg y) z) (fnmsub ty x y z))

(rule (fnmsub ty x y z) (x64_vfnmsub213 ty x y z))
(rule 1 (fnmsub ty (sinkable_load x) y z) (x64_vfnmsub132 ty y z x))
(rule 2 (fnmsub ty x (sinkable_load y) z) (x64_vfnmsub132 ty x z y))

;; Like `fmsub` if one argument is negated switch which one is being codegen'd
(rule 3 (fnmsub ty (fneg x) y z) (fmsub ty x y z))
(rule 4 (fnmsub ty x (fneg y) z) (fmsub ty x y z))


;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; In order to load a value from memory to a GPR register, we may need to extend
;; the loaded value from 8-, 16-, or 32-bits to this backend's expected GPR
;; width: 64 bits. Note that `ext_mode` will load 1-bit types (booleans) as
;; 8-bit loads.
;;
;; By default, we zero-extend all sub-64-bit loads to a GPR.
(rule load_sub64_x64_movzx -4 (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) (load flags address offset)))
      (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address offset)))
;; But if we know that both the `from` and `to` are 64 bits, we simply load with
;; no extension.
(rule load_64_x64_movzx -1 (lower (has_type (ty_int_ref_64 ty) (load flags address offset)))
      (x64_mov (to_amode flags address offset)))
;; Also, certain scalar loads have a specific `from` width and extension kind
;; (signed -> `sx`, zeroed -> `zx`). We overwrite the high bits of the 64-bit
;; GPR even if the `to` type is smaller (e.g., 16-bits).
(rule (lower (has_type (is_gpr_type ty) (uload8 flags address offset)))
      (x64_movzx (ExtMode.BQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (sload8 flags address offset)))
      (x64_movsx (ExtMode.BQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (uload16 flags address offset)))
      (x64_movzx (ExtMode.WQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (sload16 flags address offset)))
      (x64_movsx (ExtMode.WQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (uload32 flags address offset)))
      (x64_movzx (ExtMode.LQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (sload32 flags address offset)))
      (x64_movsx (ExtMode.LQ) (to_amode flags address offset)))

;; To load to XMM registers, we use the x64-specific instructions for each type.
;; For `$F32` and `$F64` this is important--we only want to load 32 or 64 bits.
;; But for the 128-bit types, this is not strictly necessary for performance but
;; might help with clarity during disassembly.
(rule (lower (has_type $F16 (load flags address offset)))
      (x64_pinsrw (xmm_uninit_value) (to_amode flags address offset) 0))
(rule (lower (has_type $F32 (load flags address offset)))
      (x64_movss_load (to_amode flags address offset)))
(rule (lower (has_type $F64 (load flags address offset)))
      (x64_movsd_load (to_amode flags address offset)))
(rule (lower (has_type $F128 (load flags address offset)))
      (x64_movdqu_load (to_amode flags address offset)))
(rule (lower (has_type $F32X4 (load flags address offset)))
      (x64_movups_load (to_amode flags address offset)))
(rule (lower (has_type $F64X2 (load flags address offset)))
      (x64_movupd_load (to_amode flags address offset)))
(rule -2 (lower (has_type (ty_vec128 ty) (load flags address offset)))
      (x64_movdqu_load (to_amode flags address offset)))

;; We can load an I128 by doing two 64-bit loads.
(rule -3 (lower (has_type $I128
                       (load flags address offset)))
      (let ((addr_lo Amode (to_amode flags address offset))
            (addr_hi Amode (amode_offset addr_lo 8))
            (value_lo Reg (x64_mov addr_lo))
            (value_hi Reg (x64_mov addr_hi)))
        (value_regs value_lo value_hi)))

;; We also include widening vector loads; these sign- or zero-extend each lane
;; to the next wider width (e.g., 16x4 -> 32x4).
(rule 1 (lower (has_type $I16X8 (sload8x8 flags address offset)))
        (if-let true (use_sse41))
        (x64_pmovsxbw (to_amode flags address offset)))
(rule 1 (lower (has_type $I16X8 (uload8x8 flags address offset)))
        (if-let true (use_sse41))
        (x64_pmovzxbw (to_amode flags address offset)))
(rule 1 (lower (has_type $I32X4 (sload16x4 flags address offset)))
        (if-let true (use_sse41))
        (x64_pmovsxwd (to_amode flags address offset)))
(rule 1 (lower (has_type $I32X4 (uload16x4 flags address offset)))
        (if-let true (use_sse41))
        (x64_pmovzxwd (to_amode flags address offset)))
(rule 1 (lower (has_type $I64X2 (sload32x2 flags address offset)))
        (if-let true (use_sse41))
        (x64_pmovsxdq (to_amode flags address offset)))
(rule 1 (lower (has_type $I64X2 (uload32x2 flags address offset)))
        (if-let true (use_sse41))
        (x64_pmovzxdq (to_amode flags address offset)))

(rule (lower (has_type $I16X8 (sload8x8 flags address offset)))
      (lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I16X8 (uload8x8 flags address offset)))
      (lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I32X4 (sload16x4 flags address offset)))
      (lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I32X4 (uload16x4 flags address offset)))
      (lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I64X2 (sload32x2 flags address offset)))
      (lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset))))
(rule (lower (has_type $I64X2 (uload32x2 flags address offset)))
      (lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset))))

;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8-, 16-, 32- and 64-bit GPR stores.
(rule store_x64_movrm -2 (lower (store flags
                    value @ (value_type (is_gpr_type ty))
                    address
                    offset))
      (side_effect
       (x64_movrm ty (to_amode flags address offset) value)))

;; Explicit 8/16/32-bit opcodes.
(rule (lower (istore8 flags value address offset))
      (side_effect
       (x64_movrm $I8 (to_amode flags address offset) value)))
(rule (lower (istore16 flags value address offset))
      (side_effect
       (x64_movrm $I16 (to_amode flags address offset) value)))
(rule (lower (istore32 flags value address offset))
      (side_effect
       (x64_movrm $I32 (to_amode flags address offset) value)))

;; IMM stores
(rule 4 (lower (store flags value @ (value_type (fits_in_64 ty)) address offset))
      (if-let (i32_from_iconst imm) value)
      (side_effect
       (x64_movimm_m ty (to_amode flags address offset) imm)))

;; F16 stores of values in XMM registers.
(rule 0 (lower (store flags
                    value @ (value_type $F16)
                    address
                    offset))
      (side_effect
       (x64_movrm $I16 (to_amode flags address offset) (bitcast_xmm_to_gpr 16 value))))

(rule 1 (lower (store flags
                    value @ (value_type $F16)
                    address
                    offset))
      (if-let true (use_sse41))
      (side_effect
       (x64_pextrw_store (to_amode flags address offset) value 0)))

;; F32 stores of values in XMM registers.
(rule 1 (lower (store flags
                    value @ (value_type $F32)
                    address
                    offset))
      (side_effect
       (x64_movss_store (to_amode flags address offset) value)))

;; F64 stores of values in XMM registers.
(rule 1 (lower (store flags
                    value @ (value_type $F64)
                    address
                    offset))
      (side_effect
       (x64_movsd_store (to_amode flags address offset) value)))

;; F128 stores of values in XMM registers.
(rule 1 (lower (store flags
                    value @ (value_type $F128)
                    address
                    offset))
      (side_effect
       (x64_movdqu_store (to_amode flags address offset) value)))

;; Stores of F32X4 vectors.
(rule 1 (lower (store flags
                    value @ (value_type $F32X4)
                    address
                    offset))
      (side_effect
       (x64_movups_store (to_amode flags address offset) value)))

;; Stores of F64X2 vectors.
(rule 1 (lower (store flags
                    value @ (value_type $F64X2)
                    address
                    offset))
      (side_effect
       (x64_movupd_store (to_amode flags address offset) value)))

;; Stores of all other 128-bit vector types with integer lanes.
(rule -1 (lower (store flags
                    value @ (value_type (ty_vec128_int _))
                    address
                    offset))
      (side_effect
       (x64_movdqu_store (to_amode flags address offset) value)))

;; Stores of I128 values: store the two 64-bit halves separately.
(rule 0 (lower (store flags
                    value @ (value_type $I128)
                    address
                    offset))
      (let ((value_reg ValueRegs value)
            (value_lo Gpr (value_regs_get_gpr value_reg 0))
            (value_hi Gpr (value_regs_get_gpr value_reg 1))
            (addr_lo Amode (to_amode flags address offset))
            (addr_hi Amode (amode_offset addr_lo 8)))
      (side_effect
       (side_effect_concat
        (x64_movrm $I64 addr_lo value_lo)
        (x64_movrm $I64 addr_hi value_hi)))))

;; Slightly optimize the extraction of the first lane from a vector which is
;; stored in memory. In the case the first lane specifically is selected the
;; standard `movss` and `movsd` instructions can be used as-if we're storing a
;; f32 or f64 despite the source perhaps being an integer vector since the
;; result of the instruction is the same.
(rule 2 (lower (store flags
                    (has_type $F32 (extractlane value (u8_from_uimm8 0)))
                    address
                    offset))
      (side_effect
       (x64_movss_store (to_amode flags address offset) value)))
(rule 2 (lower (store flags
                    (has_type $F64 (extractlane value (u8_from_uimm8 0)))
                    address
                    offset))
      (side_effect
       (x64_movsd_store (to_amode flags address offset) value)))
(rule 2 (lower (store flags
                    (has_type $I8 (extractlane value (u8_from_uimm8 n)))
                    address
                    offset))
      (if-let true (use_sse41))
      (side_effect
       (x64_pextrb_store (to_amode flags address offset) value n)))
(rule 2 (lower (store flags
                    (has_type $I16 (extractlane value (u8_from_uimm8 n)))
                    address
                    offset))
      (if-let true (use_sse41))
      (side_effect
       (x64_pextrw_store (to_amode flags address offset) value n)))
(rule 2 (lower (store flags
                    (has_type $I32 (extractlane value (u8_from_uimm8 n)))
                    address
                    offset))
      (if-let true (use_sse41))
      (side_effect
       (x64_pextrd_store (to_amode flags address offset) value n)))
(rule 2 (lower (store flags
                    (has_type $I64 (extractlane value (u8_from_uimm8 n)))
                    address
                    offset))
      (if-let true (use_sse41))
      (side_effect
       (x64_pextrq_store (to_amode flags address offset) value n)))

;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Add mem, reg
(rule store_x64_add_mem 3 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (iadd (and
                               (sinkable_load sink)
                               (load flags addr offset))
                              src2))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_add_mem ty (to_amode flags addr offset) src2))))

;; Add mem, reg with args swapped
(rule 2 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (iadd src2
                              (and
                               (sinkable_load sink)
                               (load flags addr offset))))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_add_mem ty (to_amode flags addr offset) src2))))

;; Sub mem, reg
(rule 2 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (isub (and
                               (sinkable_load sink)
                               (load flags addr offset))
                              src2))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_sub_mem ty (to_amode flags addr offset) src2))))

;; And mem, reg
(rule 3 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (band (and
                               (sinkable_load sink)
                               (load flags addr offset))
                              src2))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_and_mem ty (to_amode flags addr offset) src2))))

;; And mem, reg with args swapped
(rule 2 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (band src2
                              (and
                               (sinkable_load sink)
                               (load flags addr offset))))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_and_mem ty (to_amode flags addr offset) src2))))

;; Or mem, reg
(rule 3 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (bor (and
                               (sinkable_load sink)
                               (load flags addr offset))
                              src2))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_or_mem ty (to_amode flags addr offset) src2))))

;; Or mem, reg with args swapped
(rule 2 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (bor src2
                              (and
                               (sinkable_load sink)
                               (load flags addr offset))))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_or_mem ty (to_amode flags addr offset) src2))))

;; Xor mem, reg
(rule 3 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (bxor (and
                               (sinkable_load sink)
                               (load flags addr offset))
                              src2))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_xor_mem ty (to_amode flags addr offset) src2))))

;; Xor mem, reg with args swapped
(rule 2 (lower
       (store flags
              (has_type (ty_32_or_64 ty)
                        (bxor src2
                              (and
                               (sinkable_load sink)
                               (load flags addr offset))))
              addr
              offset))
      (let ((_ RegMemImm sink))
        (side_effect
         (x64_xor_mem ty (to_amode flags addr offset) src2))))

;; Rules for `fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (fence))
      (side_effect (x64_mfence)))

;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (func_addr (func_ref_data _ extname dist)))
      (load_ext_name extname 0 dist))

;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (symbol_value (symbol_value_data extname dist offset)))
      (load_ext_name extname offset dist))

;; Rules for `atomic_load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; This is a normal load. The x86-TSO memory model provides sufficient
;; sequencing to satisfy the CLIF synchronisation requirements for `AtomicLoad`
;; without the need for any fence instructions.
;;
;; This lowering is only valid for I8, I16, I32, and I64. The sub-64-bit types
;; are zero extended, as with a normal load.
(rule 1 (lower (has_type $I64 (atomic_load flags address)))
      (x64_mov (to_amode flags address (zero_offset))))
(rule (lower (has_type (and (fits_in_32 ty) (ty_int _)) (atomic_load flags address)))
      (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address (zero_offset))))
;; Lower 128-bit `atomic_load` using `cmpxchg16b`.
(rule 1 (lower (has_type $I128 (atomic_load flags address)))
      (if-let true (use_cmpxchg16b))
      (x64_cmpxchg16b (value_regs (imm $I64 0) (imm $I64 0)) (value_regs (imm $I64 0) (imm $I64 0)) (to_amode flags address (zero_offset))))

;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; This is a normal store followed by an `mfence` instruction. This lowering is
;; only valid for I8, I16, I32, and I64.
(rule (lower (atomic_store flags
                           value @ (value_type (and (fits_in_64 ty) (ty_int _)))
                           address))
      (side_effect (side_effect_concat
       (x64_movrm ty (to_amode flags address (zero_offset)) value)
       (x64_mfence))))
;; Lower 128-bit `atomic_store` using `cmpxchg16b`.
(rule 1 (lower (atomic_store flags value @ (value_type $I128) address))
      (if-let true (use_cmpxchg16b))
      (side_effect (x64_atomic_128_store_seq (to_amode flags address (zero_offset)) value)))

;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
                  (atomic_cas flags address expected replacement)))
      (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset))))
(rule 1 (lower (has_type $I128 (atomic_cas flags address expected replacement)))
        (if-let true (use_cmpxchg16b))
        (x64_cmpxchg16b expected replacement (to_amode flags address (zero_offset))))

;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; This is a simple, general-case atomic update, based on a loop involving
;; `cmpxchg`.
(rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
                  (atomic_rmw flags op address input)))
      (x64_atomic_rmw_seq ty (atomic_rmw_seq_op op) (to_amode flags address (zero_offset)) input))

;; `Add` and `Sub` can use `lock xadd`
(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                  (atomic_rmw flags (AtomicRmwOp.Add) address input)))
      (x64_xadd (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))
(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                  (atomic_rmw flags (AtomicRmwOp.Sub) address input)))
      (x64_xadd (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) (x64_neg ty input)))
;; `Xchg` can use `xchg`
(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                  (atomic_rmw flags (AtomicRmwOp.Xchg) address input)))
      (x64_xchg (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))

;; `Add`, `Sub`, `And`, `Or` and `Xor` can use `lock`-prefixed instructions if
;; the old value is not required.
(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty))
                  (atomic_rmw flags (AtomicRmwOp.Add) address input)))
      (if-let (first_result res) i)
      (if-let true (value_is_unused res))
      (x64_lock_add (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))
(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty))
                  (atomic_rmw flags (AtomicRmwOp.Sub) address input)))
      (if-let (first_result res) i)
      (if-let true (value_is_unused res))
      (x64_lock_sub (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))
(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty))
                  (atomic_rmw flags (AtomicRmwOp.And) address input)))
      (if-let (first_result res) i)
      (if-let true (value_is_unused res))
      (x64_lock_and (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))
(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty))
                  (atomic_rmw flags (AtomicRmwOp.Or) address input)))
      (if-let (first_result res) i)
      (if-let true (value_is_unused res))
      (x64_lock_or (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))
(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty))
                  (atomic_rmw flags (AtomicRmwOp.Xor) address input)))
      (if-let (first_result res) i)
      (if-let true (value_is_unused res))
      (x64_lock_xor (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))

;; 128-bit integers always use a `lock cmpxchg16b` loop.
(rule 3 (lower (has_type $I128 (atomic_rmw flags op address input)))
        (if-let true (use_cmpxchg16b))
        (x64_atomic_128_rmw_seq op (to_amode flags address (zero_offset)) input))

;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (call (func_ref_data sig_ref extname dist) inputs))
      (gen_call sig_ref extname dist inputs))

(rule (lower (call_indirect sig_ref val inputs))
      (gen_call_indirect sig_ref val inputs))

;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (return_call (func_ref_data sig_ref extname dist) args))
      (gen_return_call sig_ref extname dist args))

(rule (lower (return_call_indirect sig_ref callee args))
      (gen_return_call_indirect sig_ref callee args))

;; Rules for `stack_switch` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; currently, only the Basic model is supported
(rule (lower (stack_switch store_context_ptr load_context_ptr in_payload0))
      (if-let (StackSwitchModel.Basic) (stack_switch_model))
      (let ((store_context_ptr Gpr (put_in_gpr store_context_ptr))
            (load_context_ptr Gpr (put_in_gpr load_context_ptr))
            (in_payload0 Gpr (put_in_gpr in_payload0)))
        (x64_stack_switch_basic store_context_ptr load_context_ptr in_payload0)))

;;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;;

(rule (lower (get_frame_pointer))
      (x64_rbp))

(rule (lower (get_stack_pointer))
      (x64_rsp))

(rule (lower (get_return_address))
      (x64_load $I64
                (Amode.ImmReg 8 (x64_rbp) (mem_flags_trusted))
                (ExtKind.None)))

;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower_branch (jump _) (single_target target))
      (emit_side_effect (jmp_known target)))

;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower_branch (brif (maybe_uextend (icmp cc a b)) _ _) (two_targets then else))
        (emit_side_effect (jmp_cond_icmp (emit_cmp cc a b) then else)))

(rule 2 (lower_branch (brif (maybe_uextend (fcmp cc a b)) _ _) (two_targets then else))
        (emit_side_effect (jmp_cond_fcmp (emit_fcmp cc a b) then else)))

(rule 2 (lower_branch (brif (maybe_uextend (vany_true a)) _ _) (two_targets then else))
        (emit_side_effect (jmp_cond_icmp (emit_vany_true a) then else)))

(rule 2 (lower_branch (brif (maybe_uextend (vall_true a)) _ _) (two_targets then else))
        (emit_side_effect (jmp_cond_icmp (emit_vall_true a) then else)))

(rule 1 (lower_branch (brif val @ (value_type $I128) _ _)
                      (two_targets then else))
      (emit_side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.Z) val) then else)))

(rule (lower_branch (brif val @ (value_type (ty_int_bool_or_ref)) _ _)
                    (two_targets then else))
      (emit_side_effect (with_flags_side_effect
                          (cmp_zero_int_bool_ref val)
                          (jmp_cond (CC.NZ) then else))))


;; Compare an I128 value to zero, returning a flags result suitable for making a
;; jump decision. The comparison is implemented as `(hi | low) == 0`,
;; and the result can be interpreted as follows
;; * CC.Z indicates that the value was non-zero, as one or both of the halves of
;;   the value were non-zero
;; * CC.NZ indicates that both halves of the value were 0
(decl cmp_zero_i128 (CC ValueRegs) IcmpCondResult)
(rule (cmp_zero_i128 (cc_nz_or_z cc) val)
      (let ((lo Gpr (value_regs_get_gpr val 0))
            (hi Gpr (value_regs_get_gpr val 1)))
          (icmp_cond_result
            (x64_alurmi_flags_side_effect (AluRmiROpcode.Or) $I64 lo hi)
            (cc_invert cc))))


(decl cmp_zero_int_bool_ref (Value) ProducesFlags)
(rule (cmp_zero_int_bool_ref val @ (value_type ty))
      (let ((size OperandSize (raw_operand_size_of_type ty))
            (src Gpr val))
        (x64_test size src src)))

;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower_branch (br_table idx @ (value_type ty) _) (jump_table_targets default_target jt_targets))
      (let ((size OperandSize (raw_operand_size_of_type ty))
            (jt_size u32 (jump_table_size jt_targets))
            (size_reg Reg (imm ty (u32_as_u64 jt_size)))
            (idx_reg Gpr (extend_to_gpr idx $I64 (ExtendKind.Zero)))
            (clamped_idx Reg (with_flags_reg
              (x64_cmp size idx_reg size_reg)
              (cmove ty (CC.B) idx_reg size_reg))))
      (emit_side_effect (jmp_table_seq ty clamped_idx default_target jt_targets))))

;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (select_spectre_guard (icmp cc a b) x y))
      (select_icmp (emit_cmp cc a b) x y))

(rule -1 (lower (has_type ty (select_spectre_guard c @ (value_type (fits_in_64 a_ty)) x y)))
      (let ((size OperandSize (raw_operand_size_of_type a_ty))
            (gpr_c Gpr (put_in_gpr c)))
        (with_flags (x64_test size gpr_c gpr_c) (cmove_from_values ty (CC.NZ) x y))))

(rule -2 (lower (has_type ty (select_spectre_guard c @ (value_type $I128) x y)))
      (let ((cond_result IcmpCondResult (cmp_zero_i128 (CC.Z) c)))
        (select_icmp cond_result x y)))

;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float
;; conversion instruction in isolation, it also takes the upper 64-bits of an
;; xmm register and places it into the destination. We don't actually want that
;; to happen as it could accidentally create a false dependency with a
;; previous instruction defining the register's upper 64-bits. See #7085 for
;; an instance of this.
;;
;; This means that the first operand to all of the int-to-float conversions here
;; are `(xmm_zero)` operands which is a guaranteed zero register that has no
;; dependencies on other instructions.
;;
;; Ideally this would be lifted out to a higher level to get deduplicated
;; between consecutive int-to-float operations but that's not easy
;; to do at this time. One possibility would be a mid-end rule which rewrites
;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would
;; be subject to normal LICM, but that's not feasible today.

(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8))))
      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16))))
      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
      (x64_cvtsi2ss ty (xmm_zero $F32X4) a))

(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8))))
      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16))))
      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))

(rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
      (x64_cvtsi2sd ty (xmm_zero $F64X2) a))

(rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4)))
      (x64_cvtdq2ps a))

;; Base case: decompose the i64x2 input into two scalar registers and convert
;; each of those into a float. Afterwards re-pack the two results into the final
;; destination.
(rule 0 (lower (fcvt_from_sint a @ (value_type $I64X2)))
      (let (
          (a Xmm a)
          (zero Xmm (xmm_zero $F64X2))
          (f0 Xmm (x64_cvtsi2sd $I64 zero (x64_movq_to_gpr a)))
          (f1 Xmm (x64_cvtsi2sd $I64 zero (x64_movq_to_gpr (x64_pshufd a 0b11_10_11_10))))
        )
        (x64_unpcklpd f0 f1)))

(rule 1 (lower (has_type $F64X2 (fcvt_from_sint (swiden_low a @ (value_type $I32X4)))))
      (x64_cvtdq2pd a))

;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
      (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero))))

(rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
      (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero))))

(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
      (cvt_u64_to_float_seq ty val))

;; Base case of u64x2 being converted to f64x2. No native instruction for this
;; is available so it's emulated through a series of instructions that exploit
;; the binary representation of 64-bit floats. This sequence of instructions is
;; copied from LLVM and my understanding of the general idea is to roughly:
;;
;; * For each bullet below operate in parallel on the left and right lanes.
;; * Move the low 32 bits of the input into one register and the upper
;;   32-bits into a different register, where both have all 0s for the upper
;;   32-bits. (e.g. split the 64-bit input into two locations)
;; * For the low bits, create `1.<twenty-zeros><low32>p52` via bit tricks.
;; * For the high bits, create `1.<twenty-zeros><high32>p84` via bit tricks.
;; * Create the constant `1.0p84 + 1.0p52`
;; * Add the two high halves and subtract the constant.
;;
;; Apply some math and this should produce the same result as the native
;; conversion.
;;
;; As for the bit tricks a float is represented where the low 53 bits are the
;; decimal of the float, basically:
;;
;;  f = 1.<fraction> ^ (<exponent> - 1023)
;;
;; where `<fraction>` is the low 53 bits. By placing the 32-bit halves from
;; the original integer into the low 53 bits and setting the exponent right it
;; means that each 32-bit half can become part of a 64-bit floating point
;; number. The final step in combining via float arithmetic will chop off the
;; leading `1.` at the start of the float that we constructed, one for the low
;; half and one for the upper half.
(rule -1 (lower (has_type $F64X2 (fcvt_from_uint val @ (value_type $I64X2))))
  (let ((low32_mask XmmMem (emit_u128_le_const 0x00000000ffffffff_00000000ffffffff))
        (float_1p52 XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000))
        (float_1p84 XmmMem (emit_u128_le_const 0x4530000000000000_4530000000000000))
        (float_1p84_plus_1p52 XmmMem (emit_u128_le_const 0x4530000000100000_4530000000100000))
        (low32 Xmm (x64_pand val low32_mask))
        (low32_as_float Xmm (x64_por low32 float_1p52))
        (high32 Xmm (x64_psrlq val (xmi_imm 32)))
        (high32_as_float Xmm (x64_por high32 float_1p84)))
    (x64_addpd low32_as_float (x64_subpd high32_as_float float_1p84_plus_1p52))))

;; Algorithm uses unpcklps to help create a float that is equivalent
;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
;; every value of the mantissa represents a corresponding uint32 number.
;; When we subtract 0x1.0p52 we are left with double(src).
(rule 1 (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
      (let ((uint_mask XmmMem (emit_u128_le_const 0x43300000_43300000))
            (res Xmm (x64_unpcklps val uint_mask))
            (uint_mask_high XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000)))
        (x64_subpd res uint_mask_high)))

;; When AVX512VL and AVX512F are available,
;; `fcvt_from_uint` can be lowered to a single instruction.
(rule 2 (lower (has_type $F32X4 (fcvt_from_uint src)))
      (if-let true (use_avx512vl))
      (if-let true (use_avx512f))
      (x64_vcvtudq2ps src))

;; Converting packed unsigned integers to packed floats
;; requires a few steps. There is no single instruction
;; lowering for converting unsigned floats but there is for
;; converting packed signed integers to float (cvtdq2ps). In
;; the steps below we isolate the upper half (16 bits) and
;; lower half (16 bits) of each lane and then we convert
;; each half separately using cvtdq2ps meant for signed
;; integers. In order for this to work for the upper half
;; bits we must shift right by 1 (divide by 2) these bits in
;; order to ensure the most significant bit is 0 not signed,
;; and then after the conversion we double the value.
;; Finally we add the converted values where addition will
;; correctly round.
;;
;; Sequence:
;; -> A = 0xffffffff
;; -> Ah = 0xffff0000
;; -> Al = 0x0000ffff
;; -> Convert(Al) // Convert int to float
;; -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
;; -> dst = Ah + Al // Add the two floats together
(rule 1 (lower (has_type $F32X4 (fcvt_from_uint val)))
      (let ((a Xmm val)

            ;;  get the low 16 bits
            (a_lo Xmm (x64_pslld a (xmi_imm 16)))
            (a_lo Xmm (x64_psrld a_lo (xmi_imm 16)))

            ;; get the high 16 bits
            (a_hi Xmm (x64_psubd a a_lo))

            ;; convert the low 16 bits
            (a_lo Xmm (x64_cvtdq2ps a_lo))

            ;; shift the high bits by 1, convert, and double to get the correct
            ;; value
            (a_hi Xmm (x64_psrld a_hi (xmi_imm 1)))
            (a_hi Xmm (x64_cvtdq2ps a_hi))
            (a_hi Xmm (x64_addps a_hi a_hi)))

        ;; add together the two converted values
        (x64_addps a_hi a_lo)))

;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_uint_seq out_ty val false))

(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_uint_seq out_ty val true))

(rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_sint_seq out_ty val false))

(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_sint_seq out_ty val true))

;; The x64 backend currently only supports these two type combinations.
(rule 1 (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4))))
      (let ((src Xmm val)

            ;; Sets tmp to zero if float is NaN
            (tmp Xmm (x64_cmpps src src (FcmpImm.Equal)))
            (dst Xmm (x64_andps src tmp))

            ;; Sets top bit of tmp if float is positive
            ;; Setting up to set top bit on negative float values
            (tmp Xmm (x64_pxor tmp dst))

            ;; Convert the packed float to packed doubleword.
            (dst Xmm (x64_cvttps2dq dst))

            ;; Set top bit only if < 0
            (tmp Xmm (x64_pand dst tmp))
            (tmp Xmm (x64_psrad tmp (xmi_imm 31))))

        ;; On overflow 0x80000000 is returned to a lane.
        ;; Below sets positive overflow lanes to 0x7FFFFFFF
        ;; Keeps negative overflow lanes as is.
        (x64_pxor tmp dst)))

;; The algorithm for converting floats to unsigned ints is a little tricky. The
;; complication arises because we are converting from a signed 64-bit int with a positive
;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than
;; MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes
;; precisely INT_MAX values we can correctly account for and convert every value in this range
;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
;; After the conversion we add INT_MAX+1 back to this converted value, noting again that
;; values we are trying to account for were already set to INT_MAX+1 during the original conversion.
;; We simply have to create a mask and make sure we are adding together only the lanes that need
;; to be accounted for. Digesting it all the steps then are:
;;
;; Step 1 - Account for NaN and negative floats by setting these src values to zero.
;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
;;          reasons described above.
;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
;;          values that were originally in the range (0..INT_MAX). This will come in handy during
;;          step 7 when we zero negative lanes.
;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
;;          UINT_MAX that are now less than INT_MAX thanks to the subtraction.
;; Step 6 - Convert the second set of values (tmp1)
;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
;;          converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
;;          as this will allow us to properly saturate overflow lanes when adding to 0x80000000
;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less
;;          than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
;;          UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
;;          greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF).
;;
;;
;; The table below illustrates the result after each step where it matters for the converted set.
;; Note the original value range (original src set) is the final dst in Step 8:
;;
;; Original src set:
;; | Original Value Range |    Step 1    |         Step 3         |          Step 8           |
;; |  -FLT_MIN..FLT_MAX   | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
;;
;; Copied src set (tmp1):
;; |    Step 2    |                  Step 4                  |
;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
;;
;; |                       Step 6                        |                 Step 7                 |
;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
(rule 1 (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4))))
      (let ((src Xmm val)

            ;; Converting to unsigned int so if float src is negative or NaN
            ;; will first set to zero.
            (tmp2 Xmm (xmm_zero $F32X4))
            (dst Xmm (x64_maxps src tmp2))

            ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks
            ;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because
            ;; single precision IEEE-754 floats can only accurately represent contiguous
            ;; integers up to 2^23 and outside of this range it rounds to the closest
            ;; integer that it can represent. In the case of INT_MAX, this value gets
            ;; represented as 0x4f000000 which is the integer value (INT_MAX+1).
            (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2))
            (tmp2 Xmm (x64_psrld tmp2 (xmi_imm 1)))
            (tmp2 Xmm (x64_cvtdq2ps tmp2))

            ;; Make a copy of these lanes and then do the first conversion.
            ;; Overflow lanes greater than the maximum allowed signed value will
            ;; set to 0x80000000. Negative and NaN lanes will be 0x0
            (tmp1 Xmm dst)
            (dst Xmm (x64_cvttps2dq dst))

            ;; Set lanes to src - max_signed_int
            (tmp1 Xmm (x64_subps tmp1 tmp2))

            ;; Create mask for all positive lanes to saturate (i.e. greater than
            ;; or equal to the maximum allowable unsigned int).
            (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual)))

            ;; Convert those set of lanes that have the max_signed_int factored out.
            (tmp1 Xmm (x64_cvttps2dq tmp1))

            ;; Prepare converted lanes by zeroing negative lanes and prepping lanes
            ;; that have positive overflow (based on the mask) by setting these lanes
            ;; to 0x7FFFFFFF
            (tmp1 Xmm (x64_pxor tmp1 tmp2))
            (tmp2 Xmm (xmm_zero $I32X4))
            (tmp1 Xmm (lower_vec_smax $I32X4 tmp1 tmp2)))

        ;; Add this second set of converted lanes to the original to properly handle
        ;; values greater than max signed int.
        (x64_paddd tmp1 dst)))

;; Rules for `x86_cvtt2dq` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I32X4 (x86_cvtt2dq val @ (value_type $F32X4))))
      (x64_cvttps2dq val))

;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (iadd_pairwise x y)))
      (let (
          ;; Shuffle all the even lanes of `x` and `y` into one register
          (even_lane_mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00ff_00ff_00ff_00ff_00ff_00ff_00ff_00ff)))
          (x_evens Xmm (x64_pand x even_lane_mask))
          (y_evens Xmm (x64_pand y even_lane_mask))
          (evens Xmm (x64_packuswb x_evens y_evens))

          ;; Shuffle all the odd lanes of `x` and `y` into one register
          (x_odds Xmm (x64_psrlw x (xmi_imm 8)))
          (y_odds Xmm (x64_psrlw y (xmi_imm 8)))
          (odds Xmm (x64_packuswb x_odds y_odds))
        )
        (x64_paddb evens odds)))


(rule 1 (lower (has_type $I16X8 (iadd_pairwise x y)))
        (if-let true (use_ssse3))
        (x64_phaddw x y))

(rule (lower (has_type $I16X8 (iadd_pairwise x y)))
      (let (
          (x Xmm x)
          (y Xmm y)

          ;; Shuffle the even-numbered 16-bit lanes into low four lanes of each
          ;; vector by shuffling 16-bit lanes then shuffling 32-bit lanes.
          ;; With these in place generate a new vector from the two low 64-bits
          ;; of each vector (the low four 16-bit lanes).
          ;;
          ;; 0xe8 == 0b11_10_10_00
          (x_evens Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw x 0xe8) 0xe8) 0xe8))
          (y_evens Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw y 0xe8) 0xe8) 0xe8))
          (evens Xmm (x64_punpcklqdq x_evens y_evens))

          ;; Shuffle the odd-numbered 16-bit lanes into the low 8 lanes by
          ;; performing `sshr` operation on 32-bit lanes, effectively moving the
          ;; odd lanes into even lanes while leaving their sign bits in the
          ;; odd lanes. The `packssdw` instruction then conveniently will
          ;; put everything into one vector for us.
          (x_shifted Xmm (x64_psrad x (xmi_imm 16)))
          (y_shifted Xmm (x64_psrad y (xmi_imm 16)))
          (odds Xmm (x64_packssdw x_shifted y_shifted))
        )
      (x64_paddw evens odds)))

(rule 1 (lower (has_type $I32X4 (iadd_pairwise x y)))
        (if-let true (use_ssse3))
        (x64_phaddd x y))

(rule (lower (has_type $I32X4 (iadd_pairwise x y)))
      (let (
          (x Xmm x)
          (y Xmm y)
          ;; evens = [ x[0] x[2] y[0] y[2] ]
          (evens Xmm (x64_shufps x y 0b10_00_10_00))
          ;; odds  = [ x[1] x[3] y[1] y[3] ]
          (odds  Xmm (x64_shufps x y 0b11_01_11_01))
        )
      (x64_paddd evens odds)))

;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
(rule 2 (lower
        (has_type $I16X8 (iadd_pairwise
                           (swiden_low val @ (value_type $I8X16))
                           (swiden_high val))))
      (if-let true (use_ssse3))
      (let ((mul_const Xmm (x64_xmm_load_const $I8X16
              (emit_u128_le_const 0x01010101010101010101010101010101))))
        (x64_pmaddubsw mul_const val)))

;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
(rule 2 (lower
        (has_type $I32X4 (iadd_pairwise
                           (swiden_low val @ (value_type $I16X8))
                           (swiden_high val))))
      (let ((mul_const XmmMem (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001)))
        (x64_pmaddwd val mul_const)))

;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
(rule 2 (lower
        (has_type $I16X8 (iadd_pairwise
                           (uwiden_low val @ (value_type $I8X16))
                           (uwiden_high val))))
      (if-let true (use_ssse3))
      (let ((mul_const XmmMem (emit_u128_le_const 0x01010101010101010101010101010101)))
        (x64_pmaddubsw val mul_const)))

;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
(rule 2 (lower
        (has_type $I32X4 (iadd_pairwise
                           (uwiden_low val @ (value_type $I16X8))
                           (uwiden_high val))))
      (let ((xor_const XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))
            (dst Xmm (x64_pxor val xor_const))

            (madd_const XmmMem (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001))
            (dst Xmm (x64_pmaddwd dst madd_const))

            (addd_const XmmMem (emit_u128_le_const 0x00010000_00010000_00010000_00010000)))
        (x64_paddd dst addd_const)))

;; special case for the `i32x4.dot_i16x8_s` wasm instruction
(rule 2 (lower
        (has_type $I32X4 (iadd_pairwise
                           (imul (swiden_low x) (swiden_low y))
                           (imul (swiden_high x) (swiden_high y)))))
      (x64_pmaddwd x y))

;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; With SSE4.1 use the `pmovsx*` instructions for this
(rule 1 (lower (has_type $I16X8 (swiden_low val @ (value_type $I8X16))))
        (if-let true (use_sse41))
        (x64_pmovsxbw val))
(rule 1 (lower (has_type $I32X4 (swiden_low val @ (value_type $I16X8))))
        (if-let true (use_sse41))
        (x64_pmovsxwd val))
(rule 1 (lower (has_type $I64X2 (swiden_low val @ (value_type $I32X4))))
        (if-let true (use_sse41))
        (x64_pmovsxdq val))

(rule (lower (has_type ty (swiden_low val))) (lower_swiden_low ty val))

(decl lower_swiden_low (Type Xmm) Xmm)

;; Duplicate the low lanes next to each other, then perform a wider shift-right
;; by the low lane width to move the upper of each pair back into the lower lane
;; of each pair, achieving the widening of the lower lanes.
(rule (lower_swiden_low $I16X8 val)
      (x64_psraw (x64_punpcklbw val val) (xmi_imm 8)))
(rule (lower_swiden_low $I32X4 val)
      (x64_psrad (x64_punpcklwd val val) (xmi_imm 16)))

;; Generate the sign-extended halves with a `val < 0` comparison (expressed
;; reversed here), then interleave the low 32-bit halves to create the full
;; 64-bit results.
(rule (lower_swiden_low $I64X2 val)
      (let ((tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val)))
      (x64_punpckldq val tmp)))

;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Similar to `swiden_low` with SSE4.1 except that the upper lanes are moved
;; to the lower lanes first.
(rule 1 (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
        (if-let true (use_sse41))
        (if-let true (use_ssse3))
        (let ((x Xmm val))
          (x64_pmovsxbw (x64_palignr x x 8))))
(rule 1 (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
        (if-let true (use_sse41))
        (if-let true (use_ssse3))
        (let ((x Xmm val))
          (x64_pmovsxwd (x64_palignr x x 8))))
(rule 1 (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
        (if-let true (use_sse41))
        (x64_pmovsxdq (x64_pshufd val 0b11_10_11_10)))

;; Similar to `swiden_low` versions but using `punpckh*` instructions to
;; pair the high lanes next to each other.
(rule (lower (has_type $I16X8 (swiden_high val @ (value_type $I8X16))))
      (let ((val Xmm val))
        (x64_psraw (x64_punpckhbw val val) (xmi_imm 8))))
(rule (lower (has_type $I32X4 (swiden_high val @ (value_type $I16X8))))
      (let ((val Xmm val))
        (x64_psrad (x64_punpckhwd val val) (xmi_imm 16))))

;; Same as `swiden_low`, but `val` has its high lanes moved down.
(rule (lower (has_type $I64X2 (swiden_high val @ (value_type $I32X4))))
      (let ((val Xmm (x64_pshufd val 0b00_00_11_10))
            (tmp Xmm (x64_pcmpgtd (xmm_zero $I32X4) val)))
      (x64_punpckldq val tmp)))

;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; With SSE4.1 use the `pmovzx*` instructions for this
(rule 1 (lower (has_type $I16X8 (uwiden_low val @ (value_type $I8X16))))
        (if-let true (use_sse41))
        (x64_pmovzxbw val))
(rule 1 (lower (has_type $I32X4 (uwiden_low val @ (value_type $I16X8))))
        (if-let true (use_sse41))
        (x64_pmovzxwd val))
(rule 1 (lower (has_type $I64X2 (uwiden_low val @ (value_type $I32X4))))
        (if-let true (use_sse41))
        (x64_pmovzxdq val))

(rule (lower (has_type ty (uwiden_low val))) (lower_uwiden_low ty val))

;; Interleave an all-zero register with the low lanes to produce zero-extended
;; results.
(decl lower_uwiden_low (Type Xmm) Xmm)
(rule (lower_uwiden_low $I16X8 val) (x64_punpcklbw val (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I32X4 val) (x64_punpcklwd val (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I64X2 val) (x64_unpcklps val (xmm_zero $F32X4)))

;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same as `uwiden_high`, but interleaving high lanes instead.
;;
;; Note that according to `llvm-mca` at least these instructions are faster
;; than using `pmovzx*` in terms of cycles, even if SSE4.1 is available.
(rule (lower (has_type $I16X8 (uwiden_high val @ (value_type $I8X16))))
      (x64_punpckhbw val (xmm_zero $I8X16)))
(rule (lower (has_type $I32X4 (uwiden_high val @ (value_type $I16X8))))
      (x64_punpckhwd val (xmm_zero $I8X16)))
(rule (lower (has_type $I64X2 (uwiden_high val @ (value_type $I32X4))))
      (x64_unpckhps val (xmm_zero $F32X4)))

;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (snarrow a @ (value_type $I16X8) b)))
      (x64_packsswb a b))

(rule (lower (has_type $I16X8 (snarrow a @ (value_type $I32X4) b)))
      (x64_packssdw a b))

;; We're missing a `snarrow` case for $I64X2
;; https://github.com/bytecodealliance/wasmtime/issues/4734

;; This rule is a special case for handling the translation of the wasm op
;; `i32x4.trunc_sat_f64x2_s_zero`. It can be removed once we have an
;; implementation of `snarrow` for `I64X2`.
(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (fcvt_to_sint_sat val))
                                       (vconst (u128_from_constant 0)))))
      (let ((a Xmm val)

            ;; y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to:
            ;; MOVE xmm_tmp, xmm_x
            ;; CMPEQPD xmm_tmp, xmm_x
            ;; MOVE xmm_y, xmm_x
            ;; ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)]
            ;; MINPD xmm_y, xmm_tmp
            ;; CVTTPD2DQ xmm_y, xmm_y

            (tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal)))

            ;; 2147483647.0 is equivalent to 0x41DFFFFFFFC00000
            (umax_mask XmmMem (emit_u128_le_const 0x41DFFFFFFFC00000_41DFFFFFFFC00000))

            ;; ANDPD xmm_y, [wasm_f64x2_splat(2147483647.0)]
            (tmp1 Xmm (x64_andps tmp1 umax_mask))
            (dst Xmm (x64_minpd a tmp1)))
        (x64_cvttpd2dq dst)))

;; This rule is a special case for handling the translation of the wasm op
;; `i32x4.relaxed_trunc_f64x2_s_zero`.
(rule (lower (has_type $I32X4 (snarrow (has_type $I64X2 (x86_cvtt2dq val))
                                       (vconst (u128_from_constant 0)))))
        (x64_cvttpd2dq val))

;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $I8X16 (unarrow a @ (value_type $I16X8) b)))
      (x64_packuswb a b))

(rule 1 (lower (has_type $I16X8 (unarrow a @ (value_type $I32X4) b)))
        (if-let true (use_sse41))
        (x64_packusdw a b))

;; For each input `a` and `b` take the four 32-bit lanes and compress them to
;; the low 64-bits of the vector as four 16-bit lanes. Then these are woven
;; into one final vector with a `punpcklqdq`.
;;
;; If this is performance sensitive then it's probably best to upgrade the CPU
;; to get the above single-instruction lowering.
(rule (lower (has_type $I16X8 (unarrow a @ (value_type $I32X4) b)))
      (let (
          (a Xmm (unarrow_i32x4_lanes_to_low_u16_lanes a))
          (b Xmm (unarrow_i32x4_lanes_to_low_u16_lanes b))
        )
        (x64_punpcklqdq a b)))

(decl unarrow_i32x4_lanes_to_low_u16_lanes (Xmm) Xmm)
(rule (unarrow_i32x4_lanes_to_low_u16_lanes val)
      (let (
          ;; First convert all negative values in `val` to zero lanes.
          (val_gt_zero Xmm (x64_pcmpgtd val (xmm_zero $I32X4)))
          (val Xmm (x64_pand val val_gt_zero))

          ;; Next clamp all larger-than-u16-max lanes to u16::MAX.
          (max Xmm (x64_movdqu_load (emit_u128_le_const 0x0000ffff_0000ffff_0000ffff_0000ffff)))
          (cmp Xmm (x64_pcmpgtd max val))
          (valid_lanes Xmm (x64_pand val cmp))
          (clamped_lanes Xmm (x64_pandn cmp max))
          (val Xmm (x64_por valid_lanes clamped_lanes))

          ;; Within each 64-bit half of the 32x4 vector move the first 16 bits
          ;; and the third 16 bits to the bottom of the half. Afterwards
          ;; for the 32x4 vector move the first and third lanes to the bottom
          ;; lanes, which finishes up the conversion here as all the lanes
          ;; are now converted to 16-bit values in the low 4 lanes.
          (val Xmm (x64_pshuflw val 0b00_00_10_00))
          (val Xmm (x64_pshufhw val 0b00_00_10_00))
        )
        (x64_pshufd val 0b00_00_10_00)))


;; We're missing a `unarrow` case for $I64X2
;; https://github.com/bytecodealliance/wasmtime/issues/4734

;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -3 (lower (has_type (is_gpr_type (fits_in_64 ty)) (bitcast _ src @ (value_type (is_xmm_type _)))))
      (bitcast_xmm_to_gpr (ty_bits ty) src))

(rule -2 (lower (has_type (is_xmm_type (fits_in_64 ty)) (bitcast _ src @ (value_type (is_gpr_type _)))))
      (bitcast_gpr_to_xmm (ty_bits ty) src))

(rule -1 (lower (has_type $I128 (bitcast _ src @ (value_type (is_xmm_type _)))))
      (bitcast_xmm_to_gprs src))

(rule 0 (lower (has_type (is_xmm_type _) (bitcast _ src @ (value_type $I128))))
      (bitcast_gprs_to_xmm src))

;; Bitcast between types residing in GPR registers is a no-op.
(rule 1 (lower (has_type (is_gpr_type _)
                         (bitcast _ x @ (value_type (is_gpr_type _)))))
      x)

;; Bitcast between types residing in XMM registers is a no-op.
(rule 3 (lower (has_type (is_xmm_type _)
                         (bitcast _ x @ (value_type (is_xmm_type _)))))
      x)

;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type $F32 (fcopysign a @ (value_type $F32) b)))
      (let ((sign_bit Xmm (imm $F32 0x80000000)))
        (x64_orps
          (x64_andnps sign_bit a)
          (x64_andps sign_bit b))))

(rule (lower (has_type $F64 (fcopysign a @ (value_type $F64) b)))
      (let ((sign_bit Xmm (imm $F64 0x8000000000000000)))
        (x64_orpd
          (x64_andnpd sign_bit a)
          (x64_andpd sign_bit b))))

;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;;

;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates
;; the appropriate libcall and sequence to call that.
(decl x64_round (Type RegMem RoundImm) Xmm)
(rule 1 (x64_round $F32 a imm)
        (if-let true (use_sse41))
        (x64_roundss a imm))
(rule 1 (x64_round $F64 a imm)
        (if-let true (use_sse41))
        (x64_roundsd a imm))
(rule 1 (x64_round $F32X4 a imm)
        (if-let true (use_sse41))
        (x64_roundps a imm))
(rule 1 (x64_round $F64X2 a imm)
        (if-let true (use_sse41))
        (x64_roundpd a imm))

(rule (x64_round $F32 (RegMem.Reg a) imm) (libcall_1 (round_libcall $F32 imm) a))
(rule (x64_round $F64 (RegMem.Reg a) imm) (libcall_1 (round_libcall $F64 imm) a))
(rule (x64_round $F32X4 (RegMem.Reg a) imm)
      (let (
          (libcall LibCall (round_libcall $F32 imm))
          (result Xmm (libcall_1 libcall a))
          (a1 Xmm (libcall_1 libcall (x64_pshufd a 1)))
          (result Xmm (f32x4_insertlane result a1 1))
          (a2 Xmm (libcall_1 libcall (x64_pshufd a 2)))
          (result Xmm (f32x4_insertlane result a2 2))
          (a3 Xmm (libcall_1 libcall (x64_pshufd a 3)))
          (result Xmm (f32x4_insertlane result a3 3))
        )
        result))
(rule (x64_round $F64X2 (RegMem.Reg a) imm)
      (let (
          (libcall LibCall (round_libcall $F64 imm))
          (result Xmm (libcall_1 libcall a))
          (a1 Xmm (libcall_1 libcall (x64_pshufd a 0b00_00_11_10)))
        )
        (x64_movlhps result a1)))
(rule (x64_round ty (RegMem.Mem addr) imm)
      (x64_round ty (RegMem.Reg (x64_load ty addr (ExtKind.ZeroExtend))) imm))

(decl round_libcall (Type RoundImm) LibCall)
(rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32))
(rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64))
(rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32))
(rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64))
(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32))
(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64))
(rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32))
(rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64))

;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (ceil a @ (value_type ty)))
      (x64_round ty a (RoundImm.RoundUp)))

;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (floor a @ (value_type ty)))
      (x64_round ty a (RoundImm.RoundDown)))

;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (nearest a @ (value_type ty)))
      (x64_round ty a (RoundImm.RoundNearest)))

;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trunc a @ (value_type ty)))
      (x64_round ty a (RoundImm.RoundZero)))

;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (stack_addr stack_slot offset))
      (stack_addr_impl stack_slot offset))

;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; NB: a `RegMem` divisor, while allowed in the instruction encoding, isn't
;; used right now to prevent a possibly-trapping load getting folded into the
;; `div` instruction. Ideally non-trapping loads would get folded, however, or
;; alternatively Wasmtime/Cranelift would grow support for multiple traps on
;; a single opcode and the signal kind would differentiate at runtime.

;; The inputs to the `div` instruction are different for 8-bit division so
;; it needs a special case here since the instruction being crafted has a
;; different shape.
(rule 2 (lower (udiv a @ (value_type $I8) b))
        (x64_div8 (extend_to_gpr a $I32 (ExtendKind.Zero))
                  (put_in_gpr b)
                  (DivSignedness.Unsigned)
                  (TrapCode.INTEGER_DIVISION_BY_ZERO)))

;; 16-to-64-bit division is all done with a similar instruction and the only
;; tricky requirement here is that when div traps are disallowed the divisor
;; must not be zero.
(rule 1 (lower (udiv a @ (value_type (fits_in_64 ty)) b))
        (x64_div_quotient a
                          (imm $I64 0)
                          (put_in_gpr b)
                          (raw_operand_size_of_type ty)
                          (DivSignedness.Unsigned)
                          (TrapCode.INTEGER_DIVISION_BY_ZERO)))

;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower (sdiv a @ (value_type $I8) b))
        (x64_div8 (x64_sign_extend_data a (OperandSize.Size8))
                  (nonzero_sdiv_divisor $I8 b)
                  (DivSignedness.Signed)
                  (TrapCode.INTEGER_OVERFLOW)))

(rule 1 (lower (sdiv a @ (value_type (fits_in_64 ty)) b))
        (let (
            (a Gpr a)
            (size OperandSize (raw_operand_size_of_type ty))
          )
        (x64_div_quotient a
                          (x64_sign_extend_data a size)
                          (nonzero_sdiv_divisor ty b)
                          size
                          (DivSignedness.Signed)
                          (TrapCode.INTEGER_OVERFLOW))))

;; Checks to make sure that the input `Value` is a non-zero value for `sdiv`.
;;
;; This is required to differentiate the divide-by-zero trap from the
;; integer-overflow trap, the two trapping conditions of signed division.
(decl nonzero_sdiv_divisor (Type Value) Reg)
(rule 1 (nonzero_sdiv_divisor ty (iconst imm))
        (if-let n (safe_divisor_from_imm64 ty imm))
        (imm ty n))
(rule 0 (nonzero_sdiv_divisor ty val)
      (let (
          (val Reg val)
          (_ InstOutput (side_effect (with_flags_side_effect
            (x64_test (raw_operand_size_of_type ty) val val)
            (trap_if (CC.Z) (TrapCode.INTEGER_DIVISION_BY_ZERO)))))
        )
        val))

;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The remainder is in AH, so take the result of the division and right-shift
;; by 8.
(rule 2 (lower (urem a @ (value_type $I8) b))
        (let (
            (result Gpr (x64_div8 (extend_to_gpr a $I32 (ExtendKind.Zero))
                                  (put_in_gpr b) ;; see `udiv` for why not `gpr_mem`
                                  (DivSignedness.Unsigned)
                                  (TrapCode.INTEGER_DIVISION_BY_ZERO)))
          )
          (x64_shr $I64 result (Imm8Reg.Imm8 8))))

(rule 1 (lower (urem a @ (value_type (fits_in_64 ty)) b))
        (x64_div_remainder a
                           (imm $I64 0)
                           (put_in_gpr b) ;; see `udiv` for why not `gpr_mem`
                           (raw_operand_size_of_type ty)
                           (DivSignedness.Unsigned)
                           (TrapCode.INTEGER_DIVISION_BY_ZERO)))

;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Special-cases first for constant `srem` where the checks for 0 and -1 aren't
;; applicable.
;;
;; Note that like `urem` for i8 types the result is in AH so to get the result
;; it's right-shifted down.
(rule 3 (lower (srem a @ (value_type $I8) (iconst imm)))
        (if-let n (safe_divisor_from_imm64 $I8 imm))
        (let (
            (a Gpr (x64_sign_extend_data a (OperandSize.Size8)))
            (result Gpr (x64_div8 a (imm $I8 n) (DivSignedness.Signed) (TrapCode.INTEGER_DIVISION_BY_ZERO)))
          )
          (x64_shr $I64 result (Imm8Reg.Imm8 8))))

;; Same as the above rule but for 16-to-64 bit types.
(rule 2 (lower (srem a @ (value_type ty) (iconst imm)))
        (if-let n (safe_divisor_from_imm64 ty imm))
        (let (
            (a Gpr a)
            (size OperandSize (raw_operand_size_of_type ty))
          )
          (x64_div_remainder a
                             (x64_sign_extend_data a size)
                             (imm ty n)
                             size
                             (DivSignedness.Signed)
                             (TrapCode.INTEGER_DIVISION_BY_ZERO))))

(rule 1 (lower (srem a @ (value_type $I8) b))
        (let (
            (a Gpr (x64_sign_extend_data a (OperandSize.Size8)))
          )
          (x64_shr $I64 (x64_checked_srem_seq8 a b) (Imm8Reg.Imm8 8))))

(rule (lower (srem a @ (value_type ty) b))
      (let (
          (a Gpr a)
          (size OperandSize (raw_operand_size_of_type ty))
          (hi Gpr (x64_sign_extend_data a size))
          (tmp ValueRegs (x64_checked_srem_seq size a hi b))
        )
        (value_regs_get tmp 1)))

;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (umulhi a @ (value_type $I8) b))
        (x64_shr $I16 (x64_mul8 false a b) (imm8_to_imm8_gpr 8)))

(rule 1 (lower (umulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b))
        (value_regs_get_gpr (x64_mul ty false a b) 1))

;; The BMI2 instruction set introduced `mulx` which defines two registers but
;; if the two registers are the same then it only defines the upper bits. This
;; helps slightly reduce register pressure by ensuring only one register here is
;; clobbered.
(rule 2 (lower (umulhi a @ (value_type (ty_32_or_64 ty)) b))
        (if-let true (use_bmi2))
        (x64_mulx_hi ty a b))

;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (smulhi a @ (value_type $I8) b))
        (x64_sar $I16 (x64_mul8 true a b) (imm8_to_imm8_gpr 8)))

(rule 1 (lower (smulhi a @ (value_type (ty_int_ref_16_to_64 ty)) b))
        (value_regs_get_gpr (x64_mul ty true a b) 1))

;; Rules for `get_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (get_pinned_reg))
      (read_pinned_gpr))

;; Rules for `set_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (set_pinned_reg a @ (value_type ty)))
      (side_effect (write_pinned_gpr a)))

;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (vconst const)))
      ;; TODO use Inst::gen_constant() instead.
      (x64_xmm_load_const ty (const_to_vconst const)))

;; Special cases for known constant patterns to skip a 16-byte load.
(rule 1 (lower (has_type ty (vconst (u128_from_constant 0)))) (xmm_zero ty))
(rule 1 (lower (has_type ty (vconst (u128_from_constant -1)))) (vector_all_ones))

;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Special case for `pblendw` which takes an 8-bit immediate where each bit
;; indicates which lane of the two operands is chosen for the output. A bit of
;; 0 chooses the corresponding 16-it lane from `a` and a bit of 1 chooses the
;; corresponding 16-bit lane from `b`.
(rule 14 (lower (shuffle a b (pblendw_imm n)))
         (if-let true (use_sse41))
         (x64_pblendw a b n))
(decl pblendw_imm (u8) Immediate)
(extern extractor pblendw_imm pblendw_imm)

;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
;; bytes", that's a `palignr` instruction. Note that the order of operands are
;; swapped in the instruction here. The `palignr` instruction uses the second
;; operand as the low-order bytes and the first operand as high-order bytes,
;; so put `a` second.
(rule 13 (lower (shuffle a b (palignr_imm_from_immediate n)))
         (if-let true (use_ssse3))
         (x64_palignr b a n))
(decl palignr_imm_from_immediate (u8) Immediate)
(extern extractor palignr_imm_from_immediate palignr_imm_from_immediate)

;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
;; integers within one value, preserving the other four 16-bit integers in that
;; value (either the high or low half). The complicated logic is in the
;; extractors here implemented in Rust and note that there's two cases for each
;; instruction here to match when either the first or second shuffle operand is
;; used.
(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm)))
      (x64_pshuflw x imm))
(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm)))
      (x64_pshuflw y imm))
(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm)))
      (x64_pshufhw x imm))
(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm)))
      (x64_pshufhw y imm))

(decl pshuflw_lhs_imm (u8) Immediate)
(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm)
(decl pshuflw_rhs_imm (u8) Immediate)
(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm)
(decl pshufhw_lhs_imm (u8) Immediate)
(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm)
(decl pshufhw_rhs_imm (u8) Immediate)
(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm)

;; Special case for the `pshufd` instruction which will permute 32-bit values
;; within a single register. This is only applicable if the `imm` specified
;; selects 32-bit values from either `x` or `y`, but not both. This means
;; there's one rule for selecting from `x` and another rule for selecting from
;; `y`.
(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm)))
      (x64_pshufd x imm))
(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm)))
      (x64_pshufd y imm))

(decl pshufd_lhs_imm (u8) Immediate)
(extern extractor pshufd_lhs_imm pshufd_lhs_imm)
(decl pshufd_rhs_imm (u8) Immediate)
(extern extractor pshufd_rhs_imm pshufd_rhs_imm)

;; Special case for i8-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
      (x64_punpckhbw a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
      (x64_punpcklbw a b))

;; Special case for i16-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
      (x64_punpckhwd a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
      (x64_punpcklwd a b))

;; Special case for i32-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
      (x64_punpckhdq a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
      (x64_punpckldq a b))

;; Special case for i64-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
      (x64_punpckhqdq a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
      (x64_punpcklqdq a b))

;; If the vector shift mask is all 0s then that means the first byte of the
;; first operand is broadcast to all bytes. Falling through would load an
;; all-zeros constant from a rip-relative location but it should be slightly
;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
;; register.
(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
        (if-let true (use_ssse3))
        (x64_pshufb a (xmm_zero $I8X16)))

;; Special case for the `shufps` instruction which will select two 32-bit values
;; from the first operand and two 32-bit values from the second operand. Note
;; that there is a second case here as well for when the operands can be
;; swapped.
;;
;; Note that the priority of this instruction is currently lower than the above
;; special cases since `shufps` handles many of them and for now it's
;; hypothesized that the dedicated instructions are better than `shufps`.
;; Someone with more knowledge about x86 timings should perhaps reorder the
;; rules here eventually though.
(rule 5 (lower (shuffle x y (shufps_imm imm)))
      (x64_shufps x y imm))
(rule 4 (lower (shuffle x y (shufps_rev_imm imm)))
      (x64_shufps y x imm))

(decl shufps_imm(u8) Immediate)
(extern extractor shufps_imm shufps_imm)
(decl shufps_rev_imm(u8) Immediate)
(extern extractor shufps_rev_imm shufps_rev_imm)


;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
;; register. We statically build `constructed_mask` to zero out any unknown lane
;; indices (may not be completely necessary: verification could fail incorrect
;; mask values) and fix the indexes to all point to the `dst` vector.
(rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
        (if-let true (use_ssse3))
        (x64_pshufb a (shuffle_0_31_mask mask)))

;; For the case where the shuffle mask contains out-of-bounds values (values
;; greater than 31) we must mask off those resulting values in the result of
;; `vpermi2b`.
(rule 2 (lower (shuffle a b (vec_mask_from_immediate (perm_from_mask_with_zeros mask zeros))))
      (if-let true (use_avx512vl))
      (if-let true (use_avx512vbmi))
      (x64_andps (x64_vpermi2b (x64_xmm_load_const $I8X16 mask) a b) zeros))

;; However, if the shuffle mask contains no out-of-bounds values, we can use
;; `vpermi2b` without any masking.
(rule 1 (lower (shuffle a b (vec_mask_from_immediate mask)))
      (if-let true (use_avx512vl))
      (if-let true (use_avx512vbmi))
      (x64_vpermi2b (x64_xmm_load_const $I8X16 (perm_from_mask mask)) a b))

;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR
;; them together. This is necessary due to PSHUFB semantics. As in the case
;; above, we build the `constructed_mask` for each case statically.
(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
      (x64_por
        (lower_pshufb a (shuffle_0_15_mask mask))
        (lower_pshufb b (shuffle_16_31_mask mask))))

;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; SIMD swizzle; the following inefficient implementation is due to the Wasm
;; SIMD spec requiring mask indexes greater than 15 to have the same semantics
;; as a 0 index. For the spec discussion, see
;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the
;; Wasm SIMD semantics for this instruction. The instruction format maps to
;; variables like: %dst = swizzle %src, %mask
(rule (lower (swizzle src mask))
      (let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070))))
        (lower_pshufb src mask)))

;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (x86_pshufb src mask))
      (if-let true (use_ssse3))
      (x64_pshufb src mask))

;; A helper function to generate either the `pshufb` instruction or a libcall to
;; the `X86Pshufb` libcall. Note that the libcall is not exactly the most
;; performant thing in the world so this is primarily here for completeness
;; of lowerings on all x86 cpus but if rules are ideally gated on the presence
;; of SSSE3 to use the `pshufb` instruction itself.
(decl lower_pshufb (Xmm RegMem) Xmm)
(rule 1 (lower_pshufb src mask)
        (if-let true (use_ssse3))
        (x64_pshufb src mask))
(rule (lower_pshufb src (RegMem.Reg mask))
      (libcall_2 (LibCall.X86Pshufb) src mask))
(rule (lower_pshufb src (RegMem.Mem addr))
      (lower_pshufb src (x64_movdqu_load addr)))

;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Remove the extractlane instruction, leaving the float where it is. The upper
;; bits will remain unchanged; for correctness, this relies on Cranelift type
;; checking to avoid using those bits.
(rule 3 (lower (has_type (ty_scalar_float _) (extractlane val 0)))
        val)

;; `f32x4.extract_lane N` where `N != 0`
(rule 1 (lower (extractlane val @ (value_type $F32X4) (u8_from_uimm8 lane)))
        (x64_pshufd val lane))

;; `f64x2.extract_lane N` where `N != 0` (aka N == 1)
(rule (lower (extractlane val @ (value_type $F64X2) 1))
      (x64_pshufd val 0b11_10_11_10))

;; `i8x16.extract_lane N`
;;
;; Note that without SSE4.1 a 16-bit lane extraction is performed and then
;; the result is updated if the desired index is either odd or even.
(rule 2 (lower (extractlane val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane)))
        (if-let true (use_sse41))
        (x64_pextrb val lane))
;; extracting an odd lane has an extra shift-right
(rule 1 (lower (extractlane val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane)))
        (if-let 1 (u8_and lane 1))
        (x64_shr $I16 (x64_pextrw val (u8_shr lane 1)) (Imm8Reg.Imm8 8)))
;; Extracting an even lane already has the desired lane in the lower bits. Note
;; that having arbitrary upper bits in the returned register should be ok since
;; all operators on the resulting `i8` type should work correctly regardless of
;; the bits in the rest of the register.
(rule (lower (extractlane val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane)))
      (if-let 0 (u8_and lane 1))
      (x64_pextrw val (u8_shr lane 1)))

;; `i16x8.extract_lane N`
(rule (lower (extractlane val @ (value_type ty @ $I16X8) (u8_from_uimm8 lane)))
      (x64_pextrw val lane))

;; `i32x4.extract_lane N`
(rule 2 (lower (extractlane val @ (value_type ty @ $I32X4) (u8_from_uimm8 lane)))
        (if-let true (use_sse41))
        (x64_pextrd val lane))
(rule 1 (lower (extractlane val @ (value_type $I32X4) 0))
        (x64_movd_to_gpr val))
(rule (lower (extractlane val @ (value_type $I32X4) (u8_from_uimm8 n)))
      (x64_movd_to_gpr (x64_pshufd val n)))

;; `i64x2.extract_lane N`
(rule 1 (lower (extractlane val @ (value_type $I64X2) (u8_from_uimm8 lane)))
        (if-let true (use_sse41))
        (x64_pextrq val lane))
(rule (lower (extractlane val @ (value_type $I64X2) 0))
      (x64_movq_to_gpr val))
(rule (lower (extractlane val @ (value_type $I64X2) 1))
      (x64_movq_to_gpr (x64_pshufd val 0b00_00_11_10)))

;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Case 1: when moving a scalar float, we simply move from one XMM register
;; to another, expecting the register allocator to elide this. Here we
;; assume that the upper bits of a scalar float have not been munged with
;; (the same assumption the old backend makes).
(rule 1 (lower (scalar_to_vector src @ (value_type (ty_scalar_float _))))
      src)

;; Case 2: when moving a scalar value of any other type, use MOVD to zero
;; the upper lanes.
(rule (lower (scalar_to_vector src @ (value_type ty)))
      (bitcast_gpr_to_xmm (ty_bits ty) src))

;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
;; MOVSS/MOVSD instruction.
(rule 2 (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_32 _)))))
      (x64_movss_load src))
(rule 3 (lower (scalar_to_vector (and (sinkable_load src) (value_type (ty_64 _)))))
      (x64_movsd_load src))

;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; For all the splat rules below one of the goals is that splatting a value
;; doesn't end up accidentally depending on the previous value in a register.
;; This means that instructions are chosen to avoid false dependencies where
;; new values are created fresh or otherwise overwrite previous register
;; contents where possible.
;;
;; Additionally splats are specialized to special-case load-and-splat which
;; has a number of micro-optimizations available.

;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
;; with a mask of zero which is calculated with an xor-against-itself register.
(rule 0 (lower (has_type $I8X16 (splat src)))
        (let ((src Xmm (x64_movd_to_xmm src)))
          (x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0)))
(rule 1 (lower (has_type $I8X16 (splat src)))
        (if-let true (use_ssse3))
        (x64_pshufb (bitcast_gpr_to_xmm 32 src) (xmm_zero $I8X16)))
(rule 2 (lower (has_type $I8X16 (splat src)))
        (if-let true (use_avx2))
        (x64_vpbroadcastb (bitcast_gpr_to_xmm 32 src)))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
        (if-let true (use_sse41))
        (if-let true (use_ssse3))
        (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
(rule 4 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
        (if-let true (use_avx2))
        (x64_vpbroadcastb addr))

;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is
;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane
;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which
;; at that point is two of the 16-bit values we want to broadcast) to all the
;; lanes.
(rule 0 (lower (has_type $I16X8 (splat src)))
        (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm 32 src) 0) 0))
(rule 1 (lower (has_type $I16X8 (splat src)))
        (if-let true (use_avx2))
        (x64_vpbroadcastw (bitcast_gpr_to_xmm 32 src)))
(rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
        (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0))
(rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
        (if-let true (use_avx2))
        (x64_vpbroadcastw addr))

;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be
;; used to broadcast the low lane to all other lanes.
;;
;; Note that sinkable-load cases come later
(rule 0 (lower (has_type $I32X4 (splat src)))
        (x64_pshufd (bitcast_gpr_to_xmm 32 src) 0))
(rule 1 (lower (has_type $I32X4 (splat src)))
        (if-let true (use_avx2))
        (x64_vpbroadcastd (bitcast_gpr_to_xmm 32 src)))

;; f32x4.splat - the source is already in an xmm register so `shufps` is all
;; that's necessary to complete the splat. This is specialized to `vbroadcastss`
;; on AVX2 to leverage that specific instruction for this operation.
(rule 0 (lower (has_type $F32X4 (splat src)))
        (let ((tmp Xmm src))
          (x64_shufps src src 0)))
(rule 1 (lower (has_type $F32X4 (splat src)))
        (if-let true (use_avx2))
        (x64_vbroadcastss src))

;; t32x4.splat of a load - use a `movss` to load into an xmm register and then
;; `shufps` broadcasts to the other lanes. Note that this is used for both i32
;; and f32 splats.
;;
;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note
;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but
;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `use_avx` test.
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
        (let ((tmp Xmm (x64_movss_load addr)))
          (x64_shufps tmp tmp 0)))
(rule 6 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
        (if-let true (use_avx))
        (x64_vbroadcastss addr))

;; t64x2.splat - use `pshufd` to broadcast the lower 64-bit lane to the upper
;; lane. A minor specialization for sinkable loads to avoid going through a gpr
;; for i64 splats is used as well when `movddup` is available.
(rule 0 (lower (has_type $I64X2 (splat src)))
        (x64_pshufd (bitcast_gpr_to_xmm 64 src) 0b01_00_01_00))
(rule 0 (lower (has_type $F64X2 (splat src)))
        (x64_pshufd src 0b01_00_01_00))
(rule 6 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
        (if-let true (use_ssse3))
        (x64_movddup addr))

;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (vany_true val))
        (if-let true (use_sse41))
        (let ((val Xmm val))
          (with_flags (x64_ptest val val) (x64_setcc (CC.NZ)))))

;; Any nonzero byte in `val` means that any lane is true. Compare `val` with a
;; zeroed register and extract the high bits to a gpr mask. If the mask is
;; 0xffff then every byte was equal to zero, so test if the comparison is
;; not-equal or NZ.
(rule (lower (vany_true val))
      (lower_icmp_bool (emit_vany_true val)))

(decl emit_vany_true (Value) IcmpCondResult)
(rule (emit_vany_true val)
      (let (
          (any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16)))
          (mask Gpr (x64_pmovmskb (OperandSize.Size32) any_byte_zero))
        )
        (icmp_cond_result (x64_cmp_imm (OperandSize.Size32) mask 0xffff)
                          (CC.NZ))))

;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (vall_true val))
      (lower_icmp_bool (emit_vall_true val)))

(decl emit_vall_true (Value) IcmpCondResult)
(rule 1 (emit_vall_true val @ (value_type ty))
        (if-let true (use_sse41))
        (let ((src Xmm val)
              (zeros Xmm (xmm_zero ty))
              (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros)))
          (icmp_cond_result (x64_ptest cmp cmp) (CC.Z))))

;; Perform an appropriately-sized lane-wise comparison with zero. If the
;; result is all 0s then all of them are true because nothing was equal to
;; zero.
(rule (emit_vall_true val @ (value_type ty))
      (let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty)))
            (mask Gpr (x64_pmovmskb (OperandSize.Size32) lanes_with_zero)))
        (icmp_cond_result (x64_test (OperandSize.Size32) mask mask)
                          (CC.Z))))

;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The Intel specification allows using both 32-bit and 64-bit GPRs as
;; destination for the "move mask" instructions. This is controlled by the REX.R
;; bit: "In 64-bit mode, the instruction can access additional registers when
;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we
;; will always clear REX.W since its use is unnecessary (`OperandSize` is used
;; for setting/clearing REX.W) as we need at most 16 bits of output for
;; `vhigh_bits`.

(rule (lower (vhigh_bits val @ (value_type (multi_lane 8 16))))
      (x64_pmovmskb (OperandSize.Size32) val))

(rule (lower (vhigh_bits val @ (value_type (multi_lane 32 4))))
      (x64_movmskps (OperandSize.Size32) val))

(rule (lower (vhigh_bits val @ (value_type (multi_lane 64 2))))
      (x64_movmskpd (OperandSize.Size32) val))

;; There is no x86 instruction for extracting the high bit of 16-bit lanes so
;; here we:
;; - duplicate the 16-bit lanes of `src` into 8-bit lanes:
;;     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
;; - use PMOVMSKB to gather the high bits; now we have duplicates, though
;; - shift away the bottom 8 high bits to remove the duplicates.
(rule (lower (vhigh_bits val @ (value_type (multi_lane 16 8))))
      (let ((src Xmm val)
            (tmp Xmm (x64_packsswb src src))
            (tmp Gpr (x64_pmovmskb (OperandSize.Size32) tmp)))
        (x64_shr $I64 tmp (Imm8Reg.Imm8 8))))

;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (iconcat lo @ (value_type $I64) hi))
      (value_regs lo hi))

;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (isplit val @ (value_type $I128)))
      (let ((regs ValueRegs val)
            (lo Reg (value_regs_get regs 0))
            (hi Reg (value_regs_get regs 1)))
        (output_pair lo hi)))

;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (tls_model (TlsModel.ElfGd)) (tls_value (symbol_value_data name _ _))))
      (elf_tls_get_addr name))

(rule (lower (has_type (tls_model (TlsModel.Macho)) (tls_value (symbol_value_data name _ _))))
      (macho_tls_get_addr name))

(rule (lower (has_type (tls_model (TlsModel.Coff)) (tls_value (symbol_value_data name _ _))))
      (coff_tls_get_addr name))

;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 1 (lower (sqmul_round_sat qx @ (value_type $I16X8) qy))
        (if-let true (use_ssse3))
        (let ((src1 Xmm qx)
              (src2 Xmm qy)

              (mask XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))
              (dst Xmm (x64_pmulhrsw src1 src2))
              (cmp Xmm (x64_pcmpeqw dst mask)))
          (x64_pxor dst cmp)))

;; This operation is defined in wasm as:
;;
;;    S.SignedSaturate((x * y + 0x4000) >> 15)
;;
;; so perform all those operations here manually with a lack of the native
;; instruction.
(rule (lower (sqmul_round_sat qx @ (value_type $I16X8) qy))
      (let (
          (qx Xmm qx)
          (qy Xmm qy)
          ;; Multiply `qx` and `qy` generating 32-bit intermediate results. The
          ;; 32-bit results have their low-halves stored in `mul_lsb` and the
          ;; high halves are stored in `mul_msb`. These are then shuffled into
          ;; `mul_lo` and `mul_hi` which represent the low 4 multiplications
          ;; and the upper 4 multiplications.
          (mul_lsb Xmm (x64_pmullw qx qy))
          (mul_msb Xmm (x64_pmulhw qx qy))
          (mul_lo Xmm (x64_punpcklwd mul_lsb mul_msb))
          (mul_hi Xmm (x64_punpckhwd mul_lsb mul_msb))
          ;; Add the 0x4000 constant to all multiplications
          (val Xmm (x64_movdqu_load (emit_u128_le_const 0x00004000_00004000_00004000_00004000)))
          (mul_lo Xmm (x64_paddd mul_lo val))
          (mul_hi Xmm (x64_paddd mul_hi val))
          ;; Perform the right-shift by 15 to all multiplications
          (lo Xmm (x64_psrad mul_lo (xmi_imm 15)))
          (hi Xmm (x64_psrad mul_hi (xmi_imm 15)))
        )
        ;; And finally perform a saturating 32-to-16-bit conversion.
        (x64_packssdw lo hi)))

;; Rules for `x86_pmulhrsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (x86_pmulhrsw qx @ (value_type $I16X8) qy))
      (if-let true (use_ssse3))
      (x64_pmulhrsw qx qy))

;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: currently we only lower a special case of `uunarrow` needed to support
;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation.
;; https://github.com/bytecodealliance/wasmtime/issues/4791
;;
;; y = i32x4.trunc_sat_f64x2_u_zero(x) is lowered to:
;; MOVAPD xmm_y, xmm_x
;; XORPD xmm_tmp, xmm_tmp
;; MAXPD xmm_y, xmm_tmp
;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
;; ROUNDPD xmm_y, xmm_y, 0x0B
;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
;; SHUFPS xmm_y, xmm_xmp, 0x88
(rule (lower (uunarrow (fcvt_to_uint_sat src @ (value_type $F64X2))
                       (vconst (u128_from_constant 0))))
      (let ((src Xmm src)

            ;; MOVAPD xmm_y, xmm_x
            ;; XORPD xmm_tmp, xmm_tmp
            (zeros Xmm (xmm_zero $F64X2))
            (dst Xmm (x64_maxpd src zeros))

            ;; 4294967295.0 is equivalent to 0x41EFFFFFFFE00000
            (umax_mask XmmMem (emit_u128_le_const 0x41EFFFFFFFE00000_41EFFFFFFFE00000))

            ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
            (dst Xmm (x64_minpd dst umax_mask))

            ;; ROUNDPD xmm_y, xmm_y, 0x0B
            (dst Xmm (x64_round $F64X2 dst (RoundImm.RoundZero)))

            ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
            (uint_mask XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000))

            (dst Xmm (x64_addpd dst uint_mask)))

        ;; SHUFPS xmm_y, xmm_xmp, 0x88
        (x64_shufps dst zeros 0x88)))

;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (nop))
      (invalid_reg))