diff --git a/lib/compiler-singlepass/src/compiler.rs b/lib/compiler-singlepass/src/compiler.rs index 961712556f5..5e723a3cfc8 100644 --- a/lib/compiler-singlepass/src/compiler.rs +++ b/lib/compiler-singlepass/src/compiler.rs @@ -180,7 +180,7 @@ impl Compiler for SinglepassCompiler { generator.finalize(input) } Architecture::Aarch64(_) => { - let machine = MachineARM64::new(); + let machine = MachineARM64::new(Some(target.clone())); let mut generator = FuncGen::new( module, &self.config, diff --git a/lib/compiler-singlepass/src/emitter_arm64.rs b/lib/compiler-singlepass/src/emitter_arm64.rs index aa5ee80f2c6..aa7d7c90443 100644 --- a/lib/compiler-singlepass/src/emitter_arm64.rs +++ b/lib/compiler-singlepass/src/emitter_arm64.rs @@ -466,6 +466,15 @@ pub trait EmitterARM64 { dst: Location, ) -> Result<(), CompileError>; + fn emit_fmov( + &mut self, + src_size: Size, + src: Location, + dst_size: Size, + dst: Location, + ) -> Result<(), CompileError>; + fn emit_cnt(&mut self, src: NEON, dst: NEON) -> Result<(), CompileError>; + fn emit_addv(&mut self, src: NEON, dst: NEON) -> Result<(), CompileError>; fn emit_read_fpcr(&mut self, reg: GPR) -> Result<(), CompileError>; fn emit_write_fpcr(&mut self, reg: GPR) -> Result<(), CompileError>; fn emit_read_fpsr(&mut self, reg: GPR) -> Result<(), CompileError>; @@ -3217,6 +3226,56 @@ impl EmitterARM64 for Assembler { dynasm!(self ; msr 0b1_011_0100_0100_001, X(reg as u32)); Ok(()) } + + fn emit_cnt(&mut self, src: NEON, dst: NEON) -> Result<(), CompileError> { + dynasm!(self ; cnt V(dst.into_index() as u32).B8, V(src.into_index() as u32).B8); + Ok(()) + } + + fn emit_addv(&mut self, src: NEON, dst: NEON) -> Result<(), CompileError> { + dynasm!(self ; addv B(dst.into_index() as u32), V(src.into_index() as u32).B8); + Ok(()) + } + + fn emit_fmov( + &mut self, + src_size: Size, + src: Location, + dst_size: Size, + dst: Location, + ) -> Result<(), CompileError> { + let res = match (src, dst) { + (AbstractLocation::GPR(src), AbstractLocation::SIMD(dst)) => { + match (src_size, dst_size) { + (Size::S32, Size::S32) => Ok(dynasm!(self ; fmov S(dst as u32), W(src as u32))), + (Size::S64, Size::S64) => Ok(dynasm!(self ; fmov D(dst as u32), X(src as u32))), + _ => Err(()), + } + } + (AbstractLocation::SIMD(src), AbstractLocation::GPR(dst)) => { + match (src_size, dst_size) { + (Size::S32, Size::S32) => Ok(dynasm!(self ; fmov W(dst as u32), S(src as u32))), + (Size::S64, Size::S64) => Ok(dynasm!(self ; fmov X(dst as u32), D(src as u32))), + _ => Err(()), + } + } + // (AbstractLocation::SIMD(src), AbstractLocation::SIMD(dst)) => todo!(), + // (AbstractLocation::SIMD(src), AbstractLocation::Imm32(dst)) => todo!(), + // (AbstractLocation::SIMD(src), AbstractLocation::Imm64(dst)) => todo!(), + _ => Err(()), + }; + + match res { + Ok(_) => Ok(()), + Err(_) => codegen_error!( + "singlepass can't generate fmov instruction for src_size: {:?}, src: {:?}, dst_size: {:?}, dst: {:?}", + src_size, + src, + dst_size, + dst, + ), + } + } } pub fn gen_std_trampoline_arm64( diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs index d4602ae77fe..e3ca24ba302 100644 --- a/lib/compiler-singlepass/src/machine.rs +++ b/lib/compiler-singlepass/src/machine.rs @@ -2411,7 +2411,7 @@ pub fn gen_std_trampoline( machine.gen_std_trampoline(sig, calling_convention) } Architecture::Aarch64(_) => { - let machine = MachineARM64::new(); + let machine = MachineARM64::new(Some(target.clone())); machine.gen_std_trampoline(sig, calling_convention) } _ => Err(CompileError::UnsupportedTarget( @@ -2433,7 +2433,7 @@ pub fn gen_std_dynamic_import_trampoline( machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention) } Architecture::Aarch64(_) => { - let machine = MachineARM64::new(); + let machine = MachineARM64::new(Some(target.clone())); machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention) } _ => Err(CompileError::UnsupportedTarget( @@ -2455,7 +2455,7 @@ pub fn gen_import_call_trampoline( machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention) } Architecture::Aarch64(_) => { - let machine = MachineARM64::new(); + let machine = MachineARM64::new(Some(target.clone())); machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention) } _ => Err(CompileError::UnsupportedTarget( diff --git a/lib/compiler-singlepass/src/machine_arm64.rs b/lib/compiler-singlepass/src/machine_arm64.rs index f5e2b0c1b18..2518dec0a0a 100644 --- a/lib/compiler-singlepass/src/machine_arm64.rs +++ b/lib/compiler-singlepass/src/machine_arm64.rs @@ -4,9 +4,9 @@ use gimli::{write::CallFrameInstruction, AArch64}; use wasmer_compiler::wasmparser::ValType as WpType; use wasmer_types::{ - CallingConvention, CompileError, CustomSection, FunctionBody, FunctionIndex, FunctionType, - InstructionAddressMap, Relocation, RelocationKind, RelocationTarget, SourceLoc, TrapCode, - TrapInformation, VMOffsets, + CallingConvention, CompileError, CpuFeature, CustomSection, FunctionBody, FunctionIndex, + FunctionType, InstructionAddressMap, Relocation, RelocationKind, RelocationTarget, SourceLoc, + Target, TrapCode, TrapInformation, VMOffsets, }; use crate::arm64_decl::new_machine_state; @@ -105,7 +105,6 @@ pub struct MachineARM64 { used_simd: u32, trap_table: TrapTable, /// Map from byte offset into wasm function to range of native instructions. - /// // Ordered by increasing InstructionAddressMap::srcloc. instructions_address_map: Vec, /// The source location for the current operator. @@ -114,6 +113,8 @@ pub struct MachineARM64 { pushed: bool, /// Vector of unwind operations with offset unwind_ops: Vec<(usize, UnwindOps)>, + /// A boolean flag signaling if this machine supports NEON. + has_neon: bool, } #[allow(dead_code)] @@ -138,7 +139,15 @@ enum ImmType { #[allow(dead_code)] impl MachineARM64 { - pub fn new() -> Self { + pub fn new(target: Option) -> Self { + // If and when needed, checks for other supported features should be + // added as boolean fields in the struct to make checking if such + // features are available as cheap as possible. + let has_neon = match target { + Some(ref target) => target.cpu_features().contains(CpuFeature::NEON), + None => false, + }; + MachineARM64 { assembler: Assembler::new(0), used_gprs: 0, @@ -148,6 +157,7 @@ impl MachineARM64 { src_loc: 0, pushed: false, unwind_ops: vec![], + has_neon, } } fn compatible_imm(&self, imm: i64, ty: ImmType) -> bool { @@ -3072,48 +3082,82 @@ impl Machine for MachineARM64 { Ok(()) } fn i32_popcnt(&mut self, loc: Location, ret: Location) -> Result<(), CompileError> { - // no opcode for that. - // 2 solutions: using NEON CNT, that count bits per Byte, or using clz with some shift and loop - let mut temps = vec![]; - let src = self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None)?; - let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; - let src = if src == loc { - let tmp = self.acquire_temp_gpr().ok_or_else(|| { + if self.has_neon { + let mut temps = vec![]; + + let src_gpr = + self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None)?; + let dst_gpr = + self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + + let mut neon_temps = vec![]; + let neon_temp = self.acquire_temp_simd().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) })?; - temps.push(tmp); + neon_temps.push(neon_temp); + self.assembler - .emit_mov(Size::S32, src, Location::GPR(tmp))?; - Location::GPR(tmp) + .emit_fmov(Size::S32, src_gpr, Size::S32, Location::SIMD(neon_temp))?; + self.assembler.emit_cnt(neon_temp, neon_temp)?; + self.assembler.emit_addv(neon_temp, neon_temp)?; + self.assembler + .emit_fmov(Size::S32, Location::SIMD(neon_temp), Size::S32, dst_gpr)?; + + if ret != dst_gpr { + self.move_location(Size::S32, dst_gpr, ret)?; + } + + for r in temps { + self.release_gpr(r); + } + + for r in neon_temps { + self.release_simd(r); + } } else { - src - }; - let tmp = { - let tmp = self.acquire_temp_gpr().ok_or_else(|| { - CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) - })?; - temps.push(tmp); - Location::GPR(tmp) - }; - let label_loop = self.assembler.get_label(); - let label_exit = self.assembler.get_label(); - self.assembler - .emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest)?; // 0 => dest - self.assembler.emit_cbz_label(Size::S32, src, label_exit)?; // src==0, exit - self.assembler.emit_label(label_loop)?; // loop: - self.assembler - .emit_add(Size::S32, dest, Location::Imm8(1), dest)?; // dest += 1 - self.assembler.emit_clz(Size::S32, src, tmp)?; // clz src => tmp - self.assembler.emit_lsl(Size::S32, src, tmp, src)?; // src << tmp => src - self.assembler - .emit_lsl(Size::S32, src, Location::Imm8(1), src)?; // src << 1 => src - self.assembler.emit_cbnz_label(Size::S32, src, label_loop)?; // if src!=0 goto loop - self.assembler.emit_label(label_exit)?; - if ret != dest { - self.move_location(Size::S32, dest, ret)?; - } - for r in temps { - self.release_gpr(r); + let mut temps = vec![]; + let src = + self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None)?; + let dest = + self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?; + let src = if src == loc { + let tmp = self.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + temps.push(tmp); + self.assembler + .emit_mov(Size::S32, src, Location::GPR(tmp))?; + Location::GPR(tmp) + } else { + src + }; + let tmp = { + let tmp = self.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + temps.push(tmp); + Location::GPR(tmp) + }; + let label_loop = self.assembler.get_label(); + let label_exit = self.assembler.get_label(); + self.assembler + .emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest)?; // 0 => dest + self.assembler.emit_cbz_label(Size::S32, src, label_exit)?; // src==0, exit + self.assembler.emit_label(label_loop)?; // loop: + self.assembler + .emit_add(Size::S32, dest, Location::Imm8(1), dest)?; // dest += 1 + self.assembler.emit_clz(Size::S32, src, tmp)?; // clz src => tmp + self.assembler.emit_lsl(Size::S32, src, tmp, src)?; // src << tmp => src + self.assembler + .emit_lsl(Size::S32, src, Location::Imm8(1), src)?; // src << 1 => src + self.assembler.emit_cbnz_label(Size::S32, src, label_loop)?; // if src!=0 goto loop + self.assembler.emit_label(label_exit)?; + if ret != dest { + self.move_location(Size::S32, dest, ret)?; + } + for r in temps { + self.release_gpr(r); + } } Ok(()) } @@ -5184,47 +5228,84 @@ impl Machine for MachineARM64 { Ok(()) } fn i64_popcnt(&mut self, loc: Location, ret: Location) -> Result<(), CompileError> { - let mut temps = vec![]; - let src = self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None)?; - let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; - let src = if src == loc { - let tmp = self.acquire_temp_gpr().ok_or_else(|| { + if self.has_neon { + let mut temps = vec![]; + + let src_gpr = + self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None)?; + let dst_gpr = + self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + + let mut neon_temps = vec![]; + let neon_temp = self.acquire_temp_simd().ok_or_else(|| { CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) })?; - temps.push(tmp); + neon_temps.push(neon_temp); + self.assembler - .emit_mov(Size::S64, src, Location::GPR(tmp))?; - Location::GPR(tmp) + .emit_fmov(Size::S64, src_gpr, Size::S64, Location::SIMD(neon_temp))?; + self.assembler.emit_cnt(neon_temp, neon_temp)?; + self.assembler.emit_addv(neon_temp, neon_temp)?; + self.assembler + .emit_fmov(Size::S64, Location::SIMD(neon_temp), Size::S64, dst_gpr)?; + + if ret != dst_gpr { + self.move_location(Size::S64, dst_gpr, ret)?; + } + + for r in temps { + self.release_gpr(r); + } + + for r in neon_temps { + self.release_simd(r); + } } else { - src - }; - let tmp = { - let tmp = self.acquire_temp_gpr().ok_or_else(|| { - CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) - })?; - temps.push(tmp); - Location::GPR(tmp) - }; - let label_loop = self.assembler.get_label(); - let label_exit = self.assembler.get_label(); - self.assembler - .emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest)?; // dest <= 0 - self.assembler.emit_cbz_label(Size::S64, src, label_exit)?; // src == 0, then goto label_exit - self.assembler.emit_label(label_loop)?; - self.assembler - .emit_add(Size::S32, dest, Location::Imm8(1), dest)?; // dest += 1 - self.assembler.emit_clz(Size::S64, src, tmp)?; // clz src => tmp - self.assembler.emit_lsl(Size::S64, src, tmp, src)?; // src << tmp => src - self.assembler - .emit_lsl(Size::S64, src, Location::Imm8(1), src)?; // src << 1 => src - self.assembler.emit_cbnz_label(Size::S64, src, label_loop)?; // src != 0, then goto label_loop - self.assembler.emit_label(label_exit)?; - if ret != dest { - self.move_location(Size::S64, dest, ret)?; - } - for r in temps { - self.release_gpr(r); + let mut temps = vec![]; + let src = + self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None)?; + let dest = + self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?; + let src = if src == loc { + let tmp = self.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + temps.push(tmp); + self.assembler + .emit_mov(Size::S64, src, Location::GPR(tmp))?; + Location::GPR(tmp) + } else { + src + }; + let tmp = { + let tmp = self.acquire_temp_gpr().ok_or_else(|| { + CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned()) + })?; + temps.push(tmp); + Location::GPR(tmp) + }; + let label_loop = self.assembler.get_label(); + let label_exit = self.assembler.get_label(); + self.assembler + .emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest)?; // dest <= 0 + self.assembler.emit_cbz_label(Size::S64, src, label_exit)?; // src == 0, then goto label_exit + self.assembler.emit_label(label_loop)?; + self.assembler + .emit_add(Size::S32, dest, Location::Imm8(1), dest)?; // dest += 1 + self.assembler.emit_clz(Size::S64, src, tmp)?; // clz src => tmp + self.assembler.emit_lsl(Size::S64, src, tmp, src)?; // src << tmp => src + self.assembler + .emit_lsl(Size::S64, src, Location::Imm8(1), src)?; // src << 1 => src + self.assembler.emit_cbnz_label(Size::S64, src, label_loop)?; // src != 0, then goto label_loop + self.assembler.emit_label(label_exit)?; + if ret != dest { + self.move_location(Size::S64, dest, ret)?; + } + for r in temps { + self.release_gpr(r); + } } + Ok(()) } fn i64_shl( @@ -8781,7 +8862,7 @@ mod test { #[test] fn tests_arm64() -> Result<(), CompileError> { - let mut machine = MachineARM64::new(); + let mut machine = MachineARM64::new(None); test_move_location(&mut machine, Size::S32)?; test_move_location(&mut machine, Size::S64)?; diff --git a/lib/types/src/compilation/target.rs b/lib/types/src/compilation/target.rs index 176e1907cce..0d3d98f1449 100644 --- a/lib/types/src/compilation/target.rs +++ b/lib/types/src/compilation/target.rs @@ -48,6 +48,7 @@ pub enum CpuFeature { AVX512F, LZCNT, // ARM features + NEON, // Risc-V features } @@ -101,7 +102,20 @@ impl CpuFeature { } features } - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + + #[cfg(target_arch = "aarch64")] + /// Retrieves the features for the current Host + pub fn for_host() -> EnumSet { + let mut features = EnumSet::new(); + + if std::arch::is_aarch64_feature_detected!("neon") { + features.insert(Self::NEON); + } + + features + } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] /// Retrieves the features for the current Host pub fn for_host() -> EnumSet { // We default to an empty hash set @@ -140,6 +154,7 @@ impl FromStr for CpuFeature { "avx512vl" => Ok(Self::AVX512VL), "avx512f" => Ok(Self::AVX512F), "lzcnt" => Ok(Self::LZCNT), + "neon" => Ok(Self::NEON), _ => Err(ParseCpuFeatureError::Missing(s.to_string())), } } @@ -162,6 +177,7 @@ impl ToString for CpuFeature { Self::AVX512VL => "avx512vl", Self::AVX512F => "avx512f", Self::LZCNT => "lzcnt", + Self::NEON => "neon", } .to_string() }