Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(singlepass): use SIMD insts for popcount #4526

Merged
merged 6 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/compiler-singlepass/src/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ impl Compiler for SinglepassCompiler {
generator.finalize(input)
}
Architecture::Aarch64(_) => {
let machine = MachineARM64::new();
let machine = MachineARM64::new(Some(target.clone()));
let mut generator = FuncGen::new(
module,
&self.config,
Expand Down
6 changes: 3 additions & 3 deletions lib/compiler-singlepass/src/machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2411,7 +2411,7 @@ pub fn gen_std_trampoline(
machine.gen_std_trampoline(sig, calling_convention)
}
Architecture::Aarch64(_) => {
let machine = MachineARM64::new();
let machine = MachineARM64::new(Some(target.clone()));
machine.gen_std_trampoline(sig, calling_convention)
}
_ => Err(CompileError::UnsupportedTarget(
Expand All @@ -2433,7 +2433,7 @@ pub fn gen_std_dynamic_import_trampoline(
machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention)
}
Architecture::Aarch64(_) => {
let machine = MachineARM64::new();
let machine = MachineARM64::new(Some(target.clone()));
machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention)
}
_ => Err(CompileError::UnsupportedTarget(
Expand All @@ -2455,7 +2455,7 @@ pub fn gen_import_call_trampoline(
machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention)
}
Architecture::Aarch64(_) => {
let machine = MachineARM64::new();
let machine = MachineARM64::new(Some(target.clone()));
machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention)
}
_ => Err(CompileError::UnsupportedTarget(
Expand Down
224 changes: 166 additions & 58 deletions lib/compiler-singlepass/src/machine_arm64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ use gimli::{write::CallFrameInstruction, AArch64};

use wasmer_compiler::wasmparser::ValType as WpType;
use wasmer_types::{
CallingConvention, CompileError, CustomSection, FunctionBody, FunctionIndex, FunctionType,
InstructionAddressMap, Relocation, RelocationKind, RelocationTarget, SourceLoc, TrapCode,
TrapInformation, VMOffsets,
CallingConvention, CompileError, CpuFeature, CustomSection, FunctionBody, FunctionIndex,
FunctionType, InstructionAddressMap, Relocation, RelocationKind, RelocationTarget, SourceLoc,
Target, TrapCode, TrapInformation, VMOffsets,
};

use crate::arm64_decl::new_machine_state;
Expand Down Expand Up @@ -114,6 +114,8 @@ pub struct MachineARM64 {
pushed: bool,
/// Vector of unwind operations with offset
unwind_ops: Vec<(usize, UnwindOps)>,
/// The actual compilation target.
target: Option<Target>,
}

#[allow(dead_code)]
Expand All @@ -138,7 +140,7 @@ enum ImmType {

#[allow(dead_code)]
impl MachineARM64 {
pub fn new() -> Self {
pub fn new(target: Option<Target>) -> Self {
MachineARM64 {
assembler: Assembler::new(0),
used_gprs: 0,
Expand All @@ -148,6 +150,7 @@ impl MachineARM64 {
src_loc: 0,
pushed: false,
unwind_ops: vec![],
target,
}
}
fn compatible_imm(&self, imm: i64, ty: ImmType) -> bool {
Expand Down Expand Up @@ -3072,38 +3075,90 @@ impl Machine for MachineARM64 {
Ok(())
}
fn i32_popcnt(&mut self, loc: Location, ret: Location) -> Result<(), CompileError> {
let mut temps = vec![];

let src_gpr =
self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None)?;
let dst_gpr =
self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?;

let mut neon_temps = vec![];
let neon_temp = self.acquire_temp_simd().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
neon_temps.push(neon_temp);
if self.target.is_some()
&& self
.target
.as_ref()
.unwrap()
.cpu_features()
.contains(CpuFeature::NEON)
xdoardo marked this conversation as resolved.
Show resolved Hide resolved
{
let mut temps = vec![];

let src_gpr =
self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None)?;
let dst_gpr =
self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?;

let mut neon_temps = vec![];
let neon_temp = self.acquire_temp_simd().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
neon_temps.push(neon_temp);

self.assembler
.emit_fmov(Size::S32, src_gpr, Size::S32, Location::SIMD(neon_temp))?;
self.assembler.emit_cnt(neon_temp, neon_temp)?;
self.assembler.emit_addv(neon_temp, neon_temp)?;
self.assembler
.emit_fmov(Size::S32, Location::SIMD(neon_temp), Size::S32, dst_gpr)?;
self.assembler
.emit_fmov(Size::S32, src_gpr, Size::S32, Location::SIMD(neon_temp))?;
self.assembler.emit_cnt(neon_temp, neon_temp)?;
self.assembler.emit_addv(neon_temp, neon_temp)?;
self.assembler
.emit_fmov(Size::S32, Location::SIMD(neon_temp), Size::S32, dst_gpr)?;

if ret != dst_gpr {
self.move_location(Size::S32, dst_gpr, ret)?;
}
if ret != dst_gpr {
self.move_location(Size::S32, dst_gpr, ret)?;
}

for r in temps {
self.release_gpr(r);
}
for r in temps {
self.release_gpr(r);
}

for r in neon_temps {
self.release_simd(r);
for r in neon_temps {
self.release_simd(r);
}
} else {
let mut temps = vec![];
let src =
self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None)?;
let dest =
self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None)?;
let src = if src == loc {
let tmp = self.acquire_temp_gpr().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
temps.push(tmp);
self.assembler
.emit_mov(Size::S32, src, Location::GPR(tmp))?;
Location::GPR(tmp)
} else {
src
};
let tmp = {
let tmp = self.acquire_temp_gpr().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
temps.push(tmp);
Location::GPR(tmp)
};
let label_loop = self.assembler.get_label();
let label_exit = self.assembler.get_label();
self.assembler
.emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest)?; // 0 => dest
self.assembler.emit_cbz_label(Size::S32, src, label_exit)?; // src==0, exit
self.assembler.emit_label(label_loop)?; // loop:
self.assembler
.emit_add(Size::S32, dest, Location::Imm8(1), dest)?; // dest += 1
self.assembler.emit_clz(Size::S32, src, tmp)?; // clz src => tmp
self.assembler.emit_lsl(Size::S32, src, tmp, src)?; // src << tmp => src
self.assembler
.emit_lsl(Size::S32, src, Location::Imm8(1), src)?; // src << 1 => src
self.assembler.emit_cbnz_label(Size::S32, src, label_loop)?; // if src!=0 goto loop
self.assembler.emit_label(label_exit)?;
if ret != dest {
self.move_location(Size::S32, dest, ret)?;
}
for r in temps {
self.release_gpr(r);
}
}

Ok(())
}
fn i32_shl(
Expand Down Expand Up @@ -5173,36 +5228,89 @@ impl Machine for MachineARM64 {
Ok(())
}
fn i64_popcnt(&mut self, loc: Location, ret: Location) -> Result<(), CompileError> {
let mut temps = vec![];

let src_gpr =
self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None)?;
let dst_gpr =
self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?;

let mut neon_temps = vec![];
let neon_temp = self.acquire_temp_simd().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
neon_temps.push(neon_temp);
if self.target.is_some()
&& self
.target
.as_ref()
.unwrap()
.cpu_features()
.contains(CpuFeature::NEON)
xdoardo marked this conversation as resolved.
Show resolved Hide resolved
{
let mut temps = vec![];

let src_gpr =
self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None)?;
let dst_gpr =
self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?;

let mut neon_temps = vec![];
let neon_temp = self.acquire_temp_simd().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
neon_temps.push(neon_temp);

self.assembler
.emit_fmov(Size::S64, src_gpr, Size::S64, Location::SIMD(neon_temp))?;
self.assembler.emit_cnt(neon_temp, neon_temp)?;
self.assembler.emit_addv(neon_temp, neon_temp)?;
self.assembler
.emit_fmov(Size::S64, Location::SIMD(neon_temp), Size::S64, dst_gpr)?;
self.assembler
.emit_fmov(Size::S64, src_gpr, Size::S64, Location::SIMD(neon_temp))?;
self.assembler.emit_cnt(neon_temp, neon_temp)?;
self.assembler.emit_addv(neon_temp, neon_temp)?;
self.assembler
.emit_fmov(Size::S64, Location::SIMD(neon_temp), Size::S64, dst_gpr)?;

if ret != dst_gpr {
self.move_location(Size::S64, dst_gpr, ret)?;
}
if ret != dst_gpr {
self.move_location(Size::S64, dst_gpr, ret)?;
}

for r in temps {
self.release_gpr(r);
}
for r in temps {
self.release_gpr(r);
}

for r in neon_temps {
self.release_simd(r);
for r in neon_temps {
self.release_simd(r);
}
} else {
let mut temps = vec![];
let src =
self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None)?;
let dest =
self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None)?;
let src = if src == loc {
let tmp = self.acquire_temp_gpr().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
temps.push(tmp);
self.assembler
.emit_mov(Size::S64, src, Location::GPR(tmp))?;
Location::GPR(tmp)
} else {
src
};
let tmp = {
let tmp = self.acquire_temp_gpr().ok_or_else(|| {
CompileError::Codegen("singlepass cannot acquire temp gpr".to_owned())
})?;
temps.push(tmp);
Location::GPR(tmp)
};
let label_loop = self.assembler.get_label();
let label_exit = self.assembler.get_label();
self.assembler
.emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest)?; // dest <= 0
self.assembler.emit_cbz_label(Size::S64, src, label_exit)?; // src == 0, then goto label_exit
self.assembler.emit_label(label_loop)?;
self.assembler
.emit_add(Size::S32, dest, Location::Imm8(1), dest)?; // dest += 1
self.assembler.emit_clz(Size::S64, src, tmp)?; // clz src => tmp
self.assembler.emit_lsl(Size::S64, src, tmp, src)?; // src << tmp => src
self.assembler
.emit_lsl(Size::S64, src, Location::Imm8(1), src)?; // src << 1 => src
self.assembler.emit_cbnz_label(Size::S64, src, label_loop)?; // src != 0, then goto label_loop
self.assembler.emit_label(label_exit)?;
if ret != dest {
self.move_location(Size::S64, dest, ret)?;
}
for r in temps {
self.release_gpr(r);
}
}

Ok(())
Expand Down Expand Up @@ -8761,7 +8869,7 @@ mod test {

#[test]
fn tests_arm64() -> Result<(), CompileError> {
let mut machine = MachineARM64::new();
let mut machine = MachineARM64::new(None);

test_move_location(&mut machine, Size::S32)?;
test_move_location(&mut machine, Size::S64)?;
Expand Down
18 changes: 17 additions & 1 deletion lib/types/src/compilation/target.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pub enum CpuFeature {
AVX512F,
LZCNT,
// ARM features
NEON,
// Risc-V features
}

Expand Down Expand Up @@ -101,7 +102,20 @@ impl CpuFeature {
}
features
}
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]

#[cfg(target_arch = "aarch64")]
/// Detects, at runtime, which of the known CPU features the host supports.
///
/// On AArch64 the only feature probed is `NEON`; the returned set is empty
/// when it is not detected.
pub fn for_host() -> EnumSet<Self> {
    let neon_available = std::arch::is_aarch64_feature_detected!("neon");

    let mut detected = EnumSet::new();
    if neon_available {
        detected.insert(Self::NEON);
    }
    detected
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
/// Retrieves the features for the current Host
pub fn for_host() -> EnumSet<Self> {
// We default to an empty hash set
Expand Down Expand Up @@ -140,6 +154,7 @@ impl FromStr for CpuFeature {
"avx512vl" => Ok(Self::AVX512VL),
"avx512f" => Ok(Self::AVX512F),
"lzcnt" => Ok(Self::LZCNT),
"neon" => Ok(Self::NEON),
_ => Err(ParseCpuFeatureError::Missing(s.to_string())),
}
}
Expand All @@ -162,6 +177,7 @@ impl ToString for CpuFeature {
Self::AVX512VL => "avx512vl",
Self::AVX512F => "avx512f",
Self::LZCNT => "lzcnt",
Self::NEON => "neon",
}
.to_string()
}
Expand Down
Loading