From 9f1dcfcb0465fc420f30f2879fe668b6882257b8 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Wed, 15 Sep 2021 14:09:09 +0200
Subject: [PATCH 1/9] feat(compiler) Added Windows x86_64 ABI to SinglePass compiler, but tests are not OK yet

---
 Makefile                                   |   2 +-
 lib/compiler-singlepass/src/codegen_x64.rs | 205 +++++++++++++++------
 lib/compiler-singlepass/src/compiler.rs    |   8 +-
 lib/compiler-singlepass/src/emitter_x64.rs |   2 +-
 lib/compiler-singlepass/src/machine.rs     |  11 ++
 lib/compiler-singlepass/src/x64_decl.rs    |  31 ++++
 tests/ignores.txt                          |   1 -
 7 files changed, 197 insertions(+), 63 deletions(-)

diff --git a/Makefile b/Makefile
index b73dc17d508..ce5a9536050 100644
--- a/Makefile
+++ b/Makefile
@@ -172,7 +172,7 @@ ifneq ($(ENABLE_SINGLEPASS), 0)
 	ifeq ($(ENABLE_SINGLEPASS), 1)
 		compilers += singlepass
 	# … otherwise, we try to check whether Singlepass works on this host.
-	else ifneq (, $(filter 1, $(IS_DARWIN) $(IS_LINUX)))
+	else ifneq (, $(filter 1, $(IS_DARWIN) $(IS_LINUX) $(IS_WINDOWS)))
 		ifeq ($(IS_AMD64), 1)
 			compilers += singlepass
 		endif
diff --git a/lib/compiler-singlepass/src/codegen_x64.rs b/lib/compiler-singlepass/src/codegen_x64.rs
index 9d5fe93b522..f337f7ea893 100644
--- a/lib/compiler-singlepass/src/codegen_x64.rs
+++ b/lib/compiler-singlepass/src/codegen_x64.rs
@@ -950,13 +950,13 @@ impl<'a> FuncGen<'a> {
         Ok(())
     }
 
-    /// Emits a System V call sequence.
+    /// Emits a System V / Windows call sequence.
     ///
     /// This function will not use RAX before `cb` is called.
    ///
     /// The caller MUST NOT hold any temporary registers allocated by `acquire_temp_gpr` when calling
     /// this function.
-    fn emit_call_sysv<I: Iterator<Item = Location>, F: FnOnce(&mut Self)>(
+    fn emit_call_native<I: Iterator<Item = Location>, F: FnOnce(&mut Self)>(
         &mut self,
         cb: F,
         params: I,
@@ -977,7 +977,7 @@ impl<'a> FuncGen<'a> {
                     self.machine.state.register_values[X64Register::GPR(*r).to_index().0].clone();
                 if content == MachineValue::Undefined {
                     return Err(CodegenError {
-                        message: "emit_call_sysv: Undefined used_gprs content".to_string(),
+                        message: "emit_call_native: Undefined used_gprs content".to_string(),
                     });
                 }
                 self.machine.state.stack_values.push(content);
@@ -1004,13 +1004,18 @@ impl<'a> FuncGen<'a> {
                     self.machine.state.register_values[X64Register::XMM(*r).to_index().0].clone();
                 if content == MachineValue::Undefined {
                     return Err(CodegenError {
-                        message: "emit_call_sysv: Undefined used_xmms content".to_string(),
+                        message: "emit_call_native: Undefined used_xmms content".to_string(),
                     });
                 }
                 self.machine.state.stack_values.push(content);
             }
         }
 
+        #[cfg(target_os = "windows")]
+        let stack_padding: usize = 32;
+        #[cfg(not(target_os = "windows"))]
+        let stack_padding: usize = 0;
+
         let mut stack_offset: usize = 0;
 
         // Calculate stack offset.
@@ -1052,7 +1057,7 @@ impl<'a> FuncGen<'a> {
                     let content = self.machine.state.register_values
                         [X64Register::GPR(x).to_index().0]
                         .clone();
-                    // FIXME: There might be some corner cases (release -> emit_call_sysv -> acquire?) that cause this assertion to fail.
+                    // FIXME: There might be some corner cases (release -> emit_call_native -> acquire?) that cause this assertion to fail.
                     // Hopefully nothing would be incorrect at runtime.
 
                     //assert!(content != MachineValue::Undefined);
@@ -1068,7 +1073,7 @@ impl<'a> FuncGen<'a> {
                 Location::Memory(reg, offset) => {
                     if reg != GPR::RBP {
                         return Err(CodegenError {
-                            message: "emit_call_sysv loc param: unreachable code"
+                            message: "emit_call_native loc param: unreachable code"
                                 .to_string(),
                         });
                     }
@@ -1090,18 +1095,18 @@ impl<'a> FuncGen<'a> {
                         // Dummy value slot to be filled with `mov`.
                        self.assembler.emit_push(Size::S64, Location::GPR(GPR::RAX));
 
-                        // Use RCX as the temporary register here, since:
+                        // Use R9 as the temporary register here, since:
                         // - It is a temporary register that is not used for any persistent value.
                         // - This register as an argument location is only written to after `sort_call_movs`.
-                        self.machine.reserve_unused_temp_gpr(GPR::RCX);
+                        self.machine.reserve_unused_temp_gpr(GPR::R9);
                         self.assembler
-                            .emit_mov(Size::S64, *param, Location::GPR(GPR::RCX));
+                            .emit_mov(Size::S64, *param, Location::GPR(GPR::R9));
                         self.assembler.emit_mov(
                             Size::S64,
-                            Location::GPR(GPR::RCX),
+                            Location::GPR(GPR::R9),
                             Location::Memory(GPR::RSP, 0),
                         );
-                        self.machine.release_temp_gpr(GPR::RCX);
+                        self.machine.release_temp_gpr(GPR::R9);
                     }
                     Location::XMM(_) => {
                         // Dummy value slot to be filled with `mov`.
@@ -1119,7 +1124,7 @@ impl<'a> FuncGen<'a> {
                     }
                     _ => {
                         return Err(CodegenError {
-                            message: "emit_call_sysv loc: unreachable code".to_string(),
+                            message: "emit_call_native loc: unreachable code".to_string(),
                         })
                     }
                 }
@@ -1144,10 +1149,18 @@ impl<'a> FuncGen<'a> {
 
         if (self.machine.state.stack_values.len() % 2) != 1 {
             return Err(CodegenError {
-                message: "emit_call_sysv: explicit shadow takes one slot".to_string(),
+                message: "emit_call_native: explicit shadow takes one slot".to_string(),
             });
         }
 
+        if stack_padding > 0 {
+            self.assembler.emit_sub(
+                Size::S64,
+                Location::Imm32(stack_padding as u32),
+                Location::GPR(GPR::RSP),
+            );
+        }
+
         cb(self);
 
         // Offset needs to be after the 'call' instruction.
@@ -1170,15 +1183,15 @@ impl<'a> FuncGen<'a> {
         }
 
         // Restore stack.
-        if stack_offset > 0 {
+        if stack_offset + stack_padding > 0 {
             self.assembler.emit_add(
                 Size::S64,
-                Location::Imm32(stack_offset as u32),
+                Location::Imm32((stack_offset + stack_padding) as u32),
                 Location::GPR(GPR::RSP),
             );
             if (stack_offset % 8) != 0 {
                 return Err(CodegenError {
-                    message: "emit_call_sysv: Bad restoring stack alignement".to_string(),
+                    message: "emit_call_native: Bad restoring stack alignment".to_string(),
                 });
             }
             for _ in 0..stack_offset / 8 {
@@ -1213,19 +1226,19 @@ impl<'a> FuncGen<'a> {
 
         if self.machine.state.stack_values.pop().unwrap() != MachineValue::ExplicitShadow {
             return Err(CodegenError {
-                message: "emit_call_sysv: Popped value is not ExplicitShadow".to_string(),
+                message: "emit_call_native: Popped value is not ExplicitShadow".to_string(),
             });
         }
         Ok(())
     }
 
     /// Emits a System V call sequence, specialized for labels as the call target.
-    fn _emit_call_sysv_label<I: Iterator<Item = Location>>(
+    fn _emit_call_native_label<I: Iterator<Item = Location>>(
         &mut self,
         label: DynamicLabel,
         params: I,
     ) -> Result<(), CodegenError> {
-        self.emit_call_sysv(|this| this.assembler.emit_call_label(label), params)?;
+        self.emit_call_native(|this| this.assembler.emit_call_label(label), params)?;
         Ok(())
     }
 
@@ -5196,7 +5209,7 @@ impl<'a> FuncGen<'a> {
                 addend: 0,
             });
 
-        // RAX is preserved on entry to `emit_call_sysv` callback.
+        // RAX is preserved on entry to `emit_call_native` callback.
         // The Imm64 value is relocated by the JIT linker.
self.assembler.emit_mov( Size::S64, @@ -5204,7 +5217,7 @@ impl<'a> FuncGen<'a> { Location::GPR(GPR::RAX), ); - self.emit_call_sysv( + self.emit_call_native( |this| { let offset = this.assembler.get_offset().0; this.trap_table @@ -5391,7 +5404,7 @@ impl<'a> FuncGen<'a> { let vmcaller_checked_anyfunc_vmctx = self.vmoffsets.vmcaller_checked_anyfunc_vmctx() as usize; - self.emit_call_sysv( + self.emit_call_native( |this| { if this.assembler.arch_requires_indirect_call_trampoline() { this.assembler.arch_emit_indirect_call_with_trampoline( @@ -5673,7 +5686,7 @@ impl<'a> FuncGen<'a> { ), Location::GPR(GPR::RAX), ); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -5709,7 +5722,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 3? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -5739,7 +5752,7 @@ impl<'a> FuncGen<'a> { Location::GPR(GPR::RAX), ); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -5782,7 +5795,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 3? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -5831,7 +5844,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 3? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -5866,7 +5879,7 @@ impl<'a> FuncGen<'a> { self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8338,7 +8351,7 @@ impl<'a> FuncGen<'a> { // TODO: unclear if we need this? check other new insts with no stack ops // self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8385,7 +8398,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 2? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8419,7 +8432,7 @@ impl<'a> FuncGen<'a> { ); self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8462,7 +8475,7 @@ impl<'a> FuncGen<'a> { Location::GPR(GPR::RAX), ); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8503,7 +8516,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 2? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8551,7 +8564,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 3? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8590,7 +8603,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 3? self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8620,7 +8633,7 @@ impl<'a> FuncGen<'a> { // TODO: should this be 3? 
self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8653,7 +8666,7 @@ impl<'a> FuncGen<'a> { // TODO: do we need this? //self.machine.release_locations_only_osr_state(1); - self.emit_call_sysv( + self.emit_call_native( |this| { this.assembler.emit_call_register(GPR::RAX); }, @@ -8799,6 +8812,10 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { stack_offset += 8; } } + #[cfg(target_os = "windows")] + let stack_padding: u32 = 32; + #[cfg(not(target_os = "windows"))] + let stack_padding: u32 = 0; // Align to 16 bytes. We push two 8-byte registers below, so here we need to ensure stack_offset % 16 == 8. if stack_offset % 16 != 8 { @@ -8812,7 +8829,7 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { // Prepare stack space. a.emit_sub( Size::S64, - Location::Imm32(stack_offset), + Location::Imm32(stack_offset + stack_padding), Location::GPR(GPR::RSP), ); @@ -8847,7 +8864,10 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { a.emit_mov( Size::S64, Location::GPR(GPR::RAX), - Location::Memory(GPR::RSP, (n_stack_args * 8) as _), + Location::Memory( + GPR::RSP, + (stack_padding as usize + n_stack_args * 8) as _, + ), ); n_stack_args += 1; } @@ -8862,7 +8882,7 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { // Restore stack. a.emit_add( Size::S64, - Location::Imm32(stack_offset), + Location::Imm32(stack_offset + stack_padding), Location::GPR(GPR::RSP), ); @@ -8896,9 +8916,13 @@ pub fn gen_std_dynamic_import_trampoline( // Allocate argument array. let stack_offset: usize = 16 * std::cmp::max(sig.params().len(), sig.results().len()) + 8; // 16 bytes each + 8 bytes sysv call padding + #[cfg(target_os = "windows")] + let stack_padding: usize = 32; + #[cfg(not(target_os = "windows"))] + let stack_padding: usize = 0; a.emit_sub( Size::S64, - Location::Imm32(stack_offset as _), + Location::Imm32((stack_offset + stack_padding) as _), Location::GPR(GPR::RSP), ); @@ -8916,7 +8940,10 @@ pub fn gen_std_dynamic_import_trampoline( None => { a.emit_mov( Size::S64, - Location::Memory(GPR::RSP, (stack_offset + 8 + stack_param_count * 8) as _), + Location::Memory( + GPR::RSP, + (stack_padding * 2 + stack_offset + 8 + stack_param_count * 8) as _, + ), Location::GPR(GPR::RAX), ); stack_param_count += 1; @@ -8926,19 +8953,29 @@ pub fn gen_std_dynamic_import_trampoline( a.emit_mov( Size::S64, source_loc, - Location::Memory(GPR::RSP, (i * 16) as _), + Location::Memory(GPR::RSP, (stack_padding + i * 16) as _), ); // Zero upper 64 bits. a.emit_mov( Size::S64, Location::Imm32(0), - Location::Memory(GPR::RSP, (i * 16 + 8) as _), + Location::Memory(GPR::RSP, (stack_padding + i * 16 + 8) as _), ); } } // Load target address. + #[cfg(target_os = "windows")] + a.emit_mov( + Size::S64, + Location::Memory( + GPR::RCX, + vmoffsets.vmdynamicfunction_import_context_address() as i32, + ), + Location::GPR(GPR::RAX), + ); + #[cfg(target_os = "linux")] a.emit_mov( Size::S64, Location::Memory( @@ -8949,6 +8986,13 @@ pub fn gen_std_dynamic_import_trampoline( ); // Load values array. + #[cfg(target_os = "windows")] + a.emit_lea( + Size::S64, + Location::Memory(GPR::RSP, stack_padding as i32), + Location::GPR(GPR::RDX), + ); + #[cfg(target_os = "linux")] a.emit_mov(Size::S64, Location::GPR(GPR::RSP), Location::GPR(GPR::RSI)); // Call target. 
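
(The 32-byte `stack_padding` used in the trampolines above is the Windows x64 "shadow space": a fastcall caller must reserve 32 bytes directly above the return address, into which the callee may spill its four register arguments RCX, RDX, R8 and R9. A minimal illustrative sketch of the frame arithmetic the dynamic-import trampoline performs — `trampoline_frame_size` is not a function in this patch, just a restatement of the `stack_offset + stack_padding` sizing emitted by the `emit_sub`/`emit_add` pair around the call:

    // One 16-byte slot per value for the values array, 8 bytes of call
    // padding, plus the 32-byte Windows shadow space under fastcall.
    fn trampoline_frame_size(n_params: usize, n_results: usize, windows_fastcall: bool) -> usize {
        let stack_offset = 16 * n_params.max(n_results) + 8;
        let stack_padding = if windows_fastcall { 32 } else { 0 };
        stack_offset + stack_padding
    }
)
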
@@ -8959,7 +9003,7 @@ pub fn gen_std_dynamic_import_trampoline(
         assert_eq!(sig.results().len(), 1);
         a.emit_mov(
             Size::S64,
-            Location::Memory(GPR::RSP, 0),
+            Location::Memory(GPR::RSP, stack_padding as i32),
             Location::GPR(GPR::RAX),
         );
     }
@@ -8967,7 +9011,7 @@ pub fn gen_std_dynamic_import_trampoline(
     // Release values array.
     a.emit_add(
         Size::S64,
-        Location::Imm32(stack_offset as _),
+        Location::Imm32((stack_offset + stack_padding) as _),
         Location::GPR(GPR::RSP),
     );
 
@@ -8990,12 +9034,45 @@ pub fn gen_import_call_trampoline(
 
     // TODO: ARM entry trampoline is not emitted.
 
+    // Singlepass internally treats all arguments as integers, but the standard Windows calling convention requires
+    // floating point arguments to be passed in XMM registers for the first 4 arguments only.
+    // That's the only change needed; other arguments are left unchanged.
+    #[cfg(target_os = "windows")]
+    if sig
+        .params()
+        .iter()
+        .any(|&x| x == Type::F32 || x == Type::F64)
+    {
+        let mut param_locations: Vec<Location> = vec![];
+        for i in 0..sig.params().len() {
+            let loc = match i {
+                0..=2 => {
+                    static PARAM_REGS: &[GPR] = &[GPR::RDX, GPR::R8, GPR::R9];
+                    Location::GPR(PARAM_REGS[i])
+                }
+                _ => Location::Memory(GPR::RSP, 32 + 8 + ((i-3) * 8) as i32), // will not be used anyway
+            };
+            param_locations.push(loc);
+        }
+        // Copy Float arguments to XMM from GPR.
+        let mut argalloc = ArgumentRegisterAllocator::default();
+        for (i, ty) in sig.params().iter().enumerate() {
+            let prev_loc = param_locations[i];
+            match argalloc.next(*ty) {
+                Some(X64Register::GPR(_gpr)) => continue,
+                Some(X64Register::XMM(xmm)) => a.emit_mov(Size::S64, prev_loc, Location::XMM(xmm)),
+                None => continue,
+            };
+        }
+    }
+
     // Singlepass internally treats all arguments as integers, but the standard System V calling convention requires
     // floating point arguments to be passed in XMM registers.
     //
     // FIXME: This is only a workaround. We should fix singlepass to use the standard CC.
 
     // Translation is expensive, so only do it if needed.
+    #[cfg(not(target_os = "windows"))]
     if sig
         .params()
         .iter()
         .any(|&x| x == Type::F32 || x == Type::F64)
     {
@@ -9074,16 +9151,32 @@ pub fn gen_import_call_trampoline(
 
     let offset = vmoffsets.vmctx_vmfunction_import(index);
 
-    a.emit_mov(
-        Size::S64,
-        Location::Memory(GPR::RDI, offset as i32), // function pointer
-        Location::GPR(GPR::RAX),
-    );
-    a.emit_mov(
-        Size::S64,
-        Location::Memory(GPR::RDI, offset as i32 + 8), // target vmctx
-        Location::GPR(GPR::RDI),
-    );
+    #[cfg(target_os = "windows")]
+    {
+        a.emit_mov(
+            Size::S64,
+            Location::Memory(GPR::RCX, offset as i32), // function pointer
+            Location::GPR(GPR::RAX),
+        );
+        a.emit_mov(
+            Size::S64,
+            Location::Memory(GPR::RCX, offset as i32 + 8), // target vmctx
+            Location::GPR(GPR::RCX),
+        );
+    }
+    #[cfg(not(target_os = "windows"))]
+    {
+        a.emit_mov(
+            Size::S64,
+            Location::Memory(GPR::RDI, offset as i32), // function pointer
+            Location::GPR(GPR::RAX),
+        );
+        a.emit_mov(
+            Size::S64,
+            Location::Memory(GPR::RDI, offset as i32 + 8), // target vmctx
+            Location::GPR(GPR::RDI),
+        );
+    }
 
     a.emit_host_redirection(GPR::RAX);
 
     let section_body = SectionBody::new_with_vec(a.finalize().unwrap().to_vec());

diff --git a/lib/compiler-singlepass/src/compiler.rs b/lib/compiler-singlepass/src/compiler.rs
index ea818264ba6..d470d5d08b3 100644
--- a/lib/compiler-singlepass/src/compiler.rs
+++ b/lib/compiler-singlepass/src/compiler.rs
@@ -57,11 +57,11 @@ impl Compiler for SinglepassCompiler {
         _module_translation: &ModuleTranslationState,
         function_body_inputs: PrimaryMap<LocalFunctionIndex, FunctionBodyData<'_>>,
     ) -> Result<Compilation, CompileError> {
-        if target.triple().operating_system == OperatingSystem::Windows {
+        /*if target.triple().operating_system == OperatingSystem::Windows {
             return Err(CompileError::UnsupportedTarget(
                 OperatingSystem::Windows.to_string(),
             ));
-        }
+        }*/
         if let Architecture::X86_32(arch) = target.triple().architecture {
             return Err(CompileError::UnsupportedTarget(arch.to_string()));
         }
@@ -219,13 +219,13 @@ mod tests {
         let compiler = SinglepassCompiler::new(Singlepass::default());
 
         // Compile for win64
-        let win64 = Target::new(triple!("x86_64-pc-windows-msvc"), CpuFeature::for_host());
+        /*let win64 = Target::new(triple!("x86_64-pc-windows-msvc"), CpuFeature::for_host());
         let (mut info, translation, inputs) = dummy_compilation_ingredients();
         let result = compiler.compile_module(&win64, &mut info, &translation, inputs);
         match result.unwrap_err() {
             CompileError::UnsupportedTarget(name) => assert_eq!(name, "windows"),
             error => panic!("Unexpected error: {:?}", error),
-        };
+        };*/
 
         // Compile for 32bit Linux
         let linux32 = Target::new(triple!("i686-unknown-linux-gnu"), CpuFeature::for_host());

diff --git a/lib/compiler-singlepass/src/emitter_x64.rs b/lib/compiler-singlepass/src/emitter_x64.rs
index b5f4f234920..04601febef3 100644
--- a/lib/compiler-singlepass/src/emitter_x64.rs
+++ b/lib/compiler-singlepass/src/emitter_x64.rs
@@ -1400,7 +1400,7 @@ impl Emitter for Assembler {
     }
 
     fn emit_bkpt(&mut self) {
-        dynasm!(self ; int 0x3);
+        dynasm!(self ; int3);
     }
 
     fn emit_host_redirection(&mut self, target: GPR) {

diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs
index 3cf01512f44..f16b3f7e771 100644
--- a/lib/compiler-singlepass/src/machine.rs
+++ b/lib/compiler-singlepass/src/machine.rs
@@ -521,6 +521,17 @@ impl Machine {
         }
     }
 
+    #[cfg(target_os = "windows")]
+    pub fn get_param_location(idx: usize) -> Location {
+        match idx {
+            0 => Location::GPR(GPR::RCX),
+            1 => Location::GPR(GPR::RDX),
+            2 => Location::GPR(GPR::R8),
+            3 => Location::GPR(GPR::R9),
+            _ => Location::Memory(GPR::RBP, (16 + 32 + (idx - 4) * 8) as i32),
+ } + } + #[cfg(not(target_os = "windows"))] pub fn get_param_location(idx: usize) -> Location { match idx { 0 => Location::GPR(GPR::RDI), diff --git a/lib/compiler-singlepass/src/x64_decl.rs b/lib/compiler-singlepass/src/x64_decl.rs index 935090694f7..0c20f5ae137 100644 --- a/lib/compiler-singlepass/src/x64_decl.rs +++ b/lib/compiler-singlepass/src/x64_decl.rs @@ -170,6 +170,37 @@ pub struct ArgumentRegisterAllocator { impl ArgumentRegisterAllocator { /// Allocates a register for argument type `ty`. Returns `None` if no register is available for this type. + #[cfg(target_os = "windows")] + pub fn next(&mut self, ty: Type) -> Option { + static GPR_SEQ: &'static [GPR] = &[GPR::RCX, GPR::RDX, GPR::R8, GPR::R9]; + static XMM_SEQ: &'static [XMM] = &[XMM::XMM0, XMM::XMM1, XMM::XMM2, XMM::XMM3]; + let idx = self.n_gprs + self.n_xmms; + match ty { + Type::I32 | Type::I64 => { + if idx < 4 { + let gpr = GPR_SEQ[idx]; + self.n_gprs += 1; + Some(X64Register::GPR(gpr)) + } else { + None + } + } + Type::F32 | Type::F64 => { + if idx < 4 { + let xmm = XMM_SEQ[idx]; + self.n_xmms += 1; + Some(X64Register::XMM(xmm)) + } else { + None + } + } + _ => todo!( + "ArgumentRegisterAllocator::next: Unsupported type: {:?}", + ty + ), + } + } + #[cfg(not(target_os = "windows"))] pub fn next(&mut self, ty: Type) -> Option { static GPR_SEQ: &'static [GPR] = &[GPR::RDI, GPR::RSI, GPR::RDX, GPR::RCX, GPR::R8, GPR::R9]; diff --git a/tests/ignores.txt b/tests/ignores.txt index a771b4418d5..cac014a355d 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -2,7 +2,6 @@ singlepass spec::multi_value # Singlepass has not implemented multivalue (functions that returns "structs"/"tuples") singlepass spec::simd # Singlepass doesn't support yet SIMD (no one asked for this feature) -singlepass+windows * # We might need to add support for Windows calling convention from host to wasm (Company showed interest to sponsor) singlepass+dylib * # It needs to add support for PIC in Singlepass. Not implemented at the moment windows+dylib * # This might be trivial to fix? 
 musl+dylib * # Dynamic loading not supported in Musl

From 7b27a812211448bc28d6c9e0578b8837c45d3895 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Wed, 15 Sep 2021 14:15:51 +0200
Subject: [PATCH 2/9] feat(compiler) fixed linting

---
 lib/compiler-singlepass/src/codegen_x64.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/compiler-singlepass/src/codegen_x64.rs b/lib/compiler-singlepass/src/codegen_x64.rs
index f337f7ea893..c4985a23f2a 100644
--- a/lib/compiler-singlepass/src/codegen_x64.rs
+++ b/lib/compiler-singlepass/src/codegen_x64.rs
@@ -9050,7 +9050,7 @@ pub fn gen_import_call_trampoline(
                     static PARAM_REGS: &[GPR] = &[GPR::RDX, GPR::R8, GPR::R9];
                     Location::GPR(PARAM_REGS[i])
                 }
-                _ => Location::Memory(GPR::RSP, 32 + 8 + ((i-3) * 8) as i32), // will not be used anyway
+                _ => Location::Memory(GPR::RSP, 32 + 8 + ((i - 3) * 8) as i32), // will not be used anyway
             };
             param_locations.push(loc);
         }

From 2e5dae0019a717a1a525893e264568ac2ab6cd9c Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Thu, 16 Sep 2021 11:03:39 +0200
Subject: [PATCH 3/9] feat(compiler) Make CallingConvention a parameter in SinglePass

---
 lib/compiler-singlepass/src/codegen_x64.rs | 360 +++++++++++----------
 lib/compiler-singlepass/src/compiler.rs    |  28 +-
 lib/compiler-singlepass/src/config.rs      |  12 +-
 lib/compiler-singlepass/src/machine.rs     |  44 +--
 lib/compiler-singlepass/src/x64_decl.rs    | 129 ++++----
 5 files changed, 304 insertions(+), 269 deletions(-)

diff --git a/lib/compiler-singlepass/src/codegen_x64.rs b/lib/compiler-singlepass/src/codegen_x64.rs
index c4985a23f2a..3f3c48e2938 100644
--- a/lib/compiler-singlepass/src/codegen_x64.rs
+++ b/lib/compiler-singlepass/src/codegen_x64.rs
@@ -8,9 +8,9 @@ use wasmer_compiler::wasmparser::{
     MemoryImmediate, Operator, Type as WpType, TypeOrFuncType as WpTypeOrFuncType,
 };
 use wasmer_compiler::{
-    CompiledFunction, CompiledFunctionFrameInfo, CustomSection, CustomSectionProtection,
-    FunctionBody, FunctionBodyData, InstructionAddressMap, Relocation, RelocationKind,
-    RelocationTarget, SectionBody, SectionIndex, SourceLoc, TrapInformation,
+    CallingConvention, CompiledFunction, CompiledFunctionFrameInfo, CustomSection,
+    CustomSectionProtection, FunctionBody, FunctionBodyData, InstructionAddressMap, Relocation,
+    RelocationKind, RelocationTarget, SectionBody, SectionIndex, SourceLoc, TrapInformation,
 };
 use wasmer_types::{
     entity::{EntityRef, PrimaryMap, SecondaryMap},
@@ -1010,17 +1010,18 @@ impl<'a> FuncGen<'a> {
             self.machine.state.stack_values.push(content);
             }
         }
+        let calling_convention = self.config.calling_convention;
 
-        #[cfg(target_os = "windows")]
-        let stack_padding: usize = 32;
-        #[cfg(not(target_os = "windows"))]
-        let stack_padding: usize = 0;
+        let stack_padding: usize = match calling_convention {
+            CallingConvention::WindowsFastcall => 32,
+            _ => 0,
+        };
 
         let mut stack_offset: usize = 0;
 
         // Calculate stack offset.
         for (i, _param) in params.iter().enumerate() {
-            if let Location::Memory(_, _) = Machine::get_param_location(1 + i) {
+            if let Location::Memory(_, _) = Machine::get_param_location(1 + i, calling_convention) {
                 stack_offset += 8;
             }
         }
@@ -1043,10 +1044,9 @@ impl<'a> FuncGen<'a> {
         }
 
         let mut call_movs: Vec<(Location, GPR)> = vec![];
-
         // Prepare register & stack parameters.
for (i, param) in params.iter().enumerate().rev() { - let loc = Machine::get_param_location(1 + i); + let loc = Machine::get_param_location(1 + i, calling_convention); match loc { Location::GPR(x) => { call_movs.push((*param, x)); @@ -1144,7 +1144,7 @@ impl<'a> FuncGen<'a> { self.assembler.emit_mov( Size::S64, Location::GPR(Machine::get_vmctx_reg()), - Machine::get_param_location(0), + Machine::get_param_location(0, calling_convention), ); // vmctx if (self.machine.state.stack_values.len() % 2) != 1 { @@ -1756,6 +1756,7 @@ impl<'a> FuncGen<'a> { &mut self.assembler, self.local_types.len(), self.signature.params().len(), + self.config.calling_convention, ); // Mark vmctx register. The actual loading of the vmctx value is handled by init_local. @@ -5403,6 +5404,7 @@ impl<'a> FuncGen<'a> { self.vmoffsets.vmcaller_checked_anyfunc_func_ptr() as usize; let vmcaller_checked_anyfunc_vmctx = self.vmoffsets.vmcaller_checked_anyfunc_vmctx() as usize; + let calling_convention = self.config.calling_convention; self.emit_call_native( |this| { @@ -5423,7 +5425,7 @@ impl<'a> FuncGen<'a> { this.assembler.emit_mov( Size::S64, Location::Memory(GPR::RAX, vmcaller_checked_anyfunc_vmctx as i32), - Machine::get_param_location(0), + Machine::get_param_location(0, calling_convention), ); this.assembler.emit_call_location(Location::Memory( @@ -8802,20 +8804,23 @@ fn sort_call_movs(movs: &mut [(Location, GPR)]) { } // Standard entry trampoline. -pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { +pub fn gen_std_trampoline( + sig: &FunctionType, + calling_convention: CallingConvention, +) -> FunctionBody { let mut a = Assembler::new().unwrap(); // Calculate stack offset. let mut stack_offset: u32 = 0; for (i, _param) in sig.params().iter().enumerate() { - if let Location::Memory(_, _) = Machine::get_param_location(1 + i) { + if let Location::Memory(_, _) = Machine::get_param_location(1 + i, calling_convention) { stack_offset += 8; } } - #[cfg(target_os = "windows")] - let stack_padding: u32 = 32; - #[cfg(not(target_os = "windows"))] - let stack_padding: u32 = 0; + let stack_padding: u32 = match calling_convention { + CallingConvention::WindowsFastcall => 32, + _ => 0, + }; // Align to 16 bytes. We push two 8-byte registers below, so here we need to ensure stack_offset % 16 == 8. if stack_offset % 16 != 8 { @@ -8836,12 +8841,12 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { // Arguments a.emit_mov( Size::S64, - Machine::get_param_location(1), + Machine::get_param_location(1, calling_convention), Location::GPR(GPR::R15), ); // func_ptr a.emit_mov( Size::S64, - Machine::get_param_location(2), + Machine::get_param_location(2, calling_convention), Location::GPR(GPR::R14), ); // args_rets @@ -8851,7 +8856,7 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { let mut n_stack_args: usize = 0; for (i, _param) in sig.params().iter().enumerate() { let src_loc = Location::Memory(GPR::R14, (i * 16) as _); // args_rets[i] - let dst_loc = Machine::get_param_location(1 + i); + let dst_loc = Machine::get_param_location(1 + i, calling_convention); match dst_loc { Location::GPR(_) => { @@ -8911,15 +8916,16 @@ pub fn gen_std_trampoline(sig: &FunctionType) -> FunctionBody { pub fn gen_std_dynamic_import_trampoline( vmoffsets: &VMOffsets, sig: &FunctionType, + calling_convention: CallingConvention, ) -> FunctionBody { let mut a = Assembler::new().unwrap(); // Allocate argument array. 
let stack_offset: usize = 16 * std::cmp::max(sig.params().len(), sig.results().len()) + 8; // 16 bytes each + 8 bytes sysv call padding - #[cfg(target_os = "windows")] - let stack_padding: usize = 32; - #[cfg(not(target_os = "windows"))] - let stack_padding: usize = 0; + let stack_padding: usize = match calling_convention { + CallingConvention::WindowsFastcall => 32, + _ => 0, + }; a.emit_sub( Size::S64, Location::Imm32((stack_offset + stack_padding) as _), @@ -8929,12 +8935,12 @@ pub fn gen_std_dynamic_import_trampoline( // Copy arguments. if !sig.params().is_empty() { let mut argalloc = ArgumentRegisterAllocator::default(); - argalloc.next(Type::I64).unwrap(); // skip VMContext + argalloc.next(Type::I64, calling_convention).unwrap(); // skip VMContext let mut stack_param_count: usize = 0; for (i, ty) in sig.params().iter().enumerate() { - let source_loc = match argalloc.next(*ty) { + let source_loc = match argalloc.next(*ty, calling_convention) { Some(X64Register::GPR(gpr)) => Location::GPR(gpr), Some(X64Register::XMM(xmm)) => Location::XMM(xmm), None => { @@ -8965,35 +8971,38 @@ pub fn gen_std_dynamic_import_trampoline( } } - // Load target address. - #[cfg(target_os = "windows")] - a.emit_mov( - Size::S64, - Location::Memory( - GPR::RCX, - vmoffsets.vmdynamicfunction_import_context_address() as i32, - ), - Location::GPR(GPR::RAX), - ); - #[cfg(target_os = "linux")] - a.emit_mov( - Size::S64, - Location::Memory( - GPR::RDI, - vmoffsets.vmdynamicfunction_import_context_address() as i32, - ), - Location::GPR(GPR::RAX), - ); - - // Load values array. - #[cfg(target_os = "windows")] - a.emit_lea( - Size::S64, - Location::Memory(GPR::RSP, stack_padding as i32), - Location::GPR(GPR::RDX), - ); - #[cfg(target_os = "linux")] - a.emit_mov(Size::S64, Location::GPR(GPR::RSP), Location::GPR(GPR::RSI)); + match calling_convention { + CallingConvention::WindowsFastcall => { + // Load target address. + a.emit_mov( + Size::S64, + Location::Memory( + GPR::RCX, + vmoffsets.vmdynamicfunction_import_context_address() as i32, + ), + Location::GPR(GPR::RAX), + ); + // Load values array. + a.emit_lea( + Size::S64, + Location::Memory(GPR::RSP, stack_padding as i32), + Location::GPR(GPR::RDX), + ); + } + _ => { + // Load target address. + a.emit_mov( + Size::S64, + Location::Memory( + GPR::RDI, + vmoffsets.vmdynamicfunction_import_context_address() as i32, + ), + Location::GPR(GPR::RAX), + ); + // Load values array. + a.emit_mov(Size::S64, Location::GPR(GPR::RSP), Location::GPR(GPR::RSI)); + } + }; // Call target. a.emit_call_location(Location::GPR(GPR::RAX)); @@ -9029,120 +9038,119 @@ pub fn gen_import_call_trampoline( vmoffsets: &VMOffsets, index: FunctionIndex, sig: &FunctionType, + calling_convention: CallingConvention, ) -> CustomSection { let mut a = Assembler::new().unwrap(); // TODO: ARM entry trampoline is not emitted. 
-    // Singlepass internally treats all arguments as integers, but the standard Windows calling convention requires
-    // floating point arguments to be passed in XMM registers for the first 4 arguments only.
-    // That's the only change needed; other arguments are left unchanged.
-    #[cfg(target_os = "windows")]
+    // Singlepass internally treats all arguments as integers.
+    // The standard Windows calling convention requires floating point arguments
+    // to be passed in XMM registers for the first 4 arguments only;
+    // that's the only change needed, other arguments are left unchanged.
+    // The standard System V calling convention requires
+    // floating point arguments to be passed in XMM registers.
+    // Translation is expensive, so only do it if needed.
     if sig
         .params()
         .iter()
         .any(|&x| x == Type::F32 || x == Type::F64)
     {
-        let mut param_locations: Vec<Location> = vec![];
-        for i in 0..sig.params().len() {
-            let loc = match i {
-                0..=2 => {
-                    static PARAM_REGS: &[GPR] = &[GPR::RDX, GPR::R8, GPR::R9];
-                    Location::GPR(PARAM_REGS[i])
+        match calling_convention {
+            CallingConvention::WindowsFastcall => {
+                let mut param_locations: Vec<Location> = vec![];
+                for i in 0..sig.params().len() {
+                    let loc = match i {
+                        0..=2 => {
+                            static PARAM_REGS: &[GPR] = &[GPR::RDX, GPR::R8, GPR::R9];
+                            Location::GPR(PARAM_REGS[i])
+                        }
+                        _ => Location::Memory(GPR::RSP, 32 + 8 + ((i - 3) * 8) as i32), // will not be used anyway
+                    };
+                    param_locations.push(loc);
                 }
-                _ => Location::Memory(GPR::RSP, 32 + 8 + ((i - 3) * 8) as i32), // will not be used anyway
-            };
-            param_locations.push(loc);
-        }
-        // Copy Float arguments to XMM from GPR.
-        let mut argalloc = ArgumentRegisterAllocator::default();
-        for (i, ty) in sig.params().iter().enumerate() {
-            let prev_loc = param_locations[i];
-            match argalloc.next(*ty) {
-                Some(X64Register::GPR(_gpr)) => continue,
-                Some(X64Register::XMM(xmm)) => a.emit_mov(Size::S64, prev_loc, Location::XMM(xmm)),
-                None => continue,
-            };
-        }
-    }
-
-    // Singlepass internally treats all arguments as integers, but the standard System V calling convention requires
-    // floating point arguments to be passed in XMM registers.
-    //
-    // FIXME: This is only a workaround. We should fix singlepass to use the standard CC.
+                // Copy Float arguments to XMM from GPR.
+                let mut argalloc = ArgumentRegisterAllocator::default();
+                for (i, ty) in sig.params().iter().enumerate() {
+                    let prev_loc = param_locations[i];
+                    match argalloc.next(*ty, calling_convention) {
+                        Some(X64Register::GPR(_gpr)) => continue,
+                        Some(X64Register::XMM(xmm)) => {
+                            a.emit_mov(Size::S64, prev_loc, Location::XMM(xmm))
+                        }
+                        None => continue,
+                    };
+                }
+            }
+            _ => {
+                let mut param_locations: Vec<Location> = vec![];
 
-    // Translation is expensive, so only do it if needed.
-    #[cfg(not(target_os = "windows"))]
-    if sig
-        .params()
-        .iter()
-        .any(|&x| x == Type::F32 || x == Type::F64)
-    {
-        let mut param_locations: Vec<Location> = vec![];
+                // Allocate stack space for arguments.
+                let stack_offset: i32 = if sig.params().len() > 5 {
+                    5 * 8
+                } else {
+                    (sig.params().len() as i32) * 8
+                };
+                if stack_offset > 0 {
+                    a.emit_sub(
+                        Size::S64,
+                        Location::Imm32(stack_offset as u32),
+                        Location::GPR(GPR::RSP),
+                    );
+                }
 
-        // Allocate stack space for arguments.
-        let stack_offset: i32 = if sig.params().len() > 5 {
-            5 * 8
-        } else {
-            (sig.params().len() as i32) * 8
-        };
-        if stack_offset > 0 {
-            a.emit_sub(
-                Size::S64,
-                Location::Imm32(stack_offset as u32),
-                Location::GPR(GPR::RSP),
-            );
-        }
-
-        // Store all arguments to the stack to prevent overwrite.
+                // Store all arguments to the stack to prevent overwrite.
+ for i in 0..sig.params().len() { + let loc = match i { + 0..=4 => { + static PARAM_REGS: &[GPR] = + &[GPR::RSI, GPR::RDX, GPR::RCX, GPR::R8, GPR::R9]; + let loc = Location::Memory(GPR::RSP, (i * 8) as i32); + a.emit_mov(Size::S64, Location::GPR(PARAM_REGS[i]), loc); + loc + } + _ => Location::Memory(GPR::RSP, stack_offset + 8 + ((i - 5) * 8) as i32), + }; + param_locations.push(loc); + } - // Store all arguments to the stack to prevent overwrite. - for i in 0..sig.params().len() { - let loc = match i { - 0..=4 => { - static PARAM_REGS: &[GPR] = &[GPR::RSI, GPR::RDX, GPR::RCX, GPR::R8, GPR::R9]; - let loc = Location::Memory(GPR::RSP, (i * 8) as i32); - a.emit_mov(Size::S64, Location::GPR(PARAM_REGS[i]), loc); - loc + // Copy arguments. + let mut argalloc = ArgumentRegisterAllocator::default(); + argalloc.next(Type::I64, calling_convention).unwrap(); // skip VMContext + let mut caller_stack_offset: i32 = 0; + for (i, ty) in sig.params().iter().enumerate() { + let prev_loc = param_locations[i]; + let targ = match argalloc.next(*ty, calling_convention) { + Some(X64Register::GPR(gpr)) => Location::GPR(gpr), + Some(X64Register::XMM(xmm)) => Location::XMM(xmm), + None => { + // No register can be allocated. Put this argument on the stack. + // + // Since here we never use fewer registers than by the original call, on the caller's frame + // we always have enough space to store the rearranged arguments, and the copy "backward" between different + // slots in the caller argument region will always work. + a.emit_mov(Size::S64, prev_loc, Location::GPR(GPR::RAX)); + a.emit_mov( + Size::S64, + Location::GPR(GPR::RAX), + Location::Memory(GPR::RSP, stack_offset + 8 + caller_stack_offset), + ); + caller_stack_offset += 8; + continue; + } + }; + a.emit_mov(Size::S64, prev_loc, targ); } - _ => Location::Memory(GPR::RSP, stack_offset + 8 + ((i - 5) * 8) as i32), - }; - param_locations.push(loc); - } - // Copy arguments. - let mut argalloc = ArgumentRegisterAllocator::default(); - argalloc.next(Type::I64).unwrap(); // skip VMContext - let mut caller_stack_offset: i32 = 0; - for (i, ty) in sig.params().iter().enumerate() { - let prev_loc = param_locations[i]; - let target = match argalloc.next(*ty) { - Some(X64Register::GPR(gpr)) => Location::GPR(gpr), - Some(X64Register::XMM(xmm)) => Location::XMM(xmm), - None => { - // No register can be allocated. Put this argument on the stack. - // - // Since here we never use fewer registers than by the original call, on the caller's frame - // we always have enough space to store the rearranged arguments, and the copy "backward" between different - // slots in the caller argument region will always work. - a.emit_mov(Size::S64, prev_loc, Location::GPR(GPR::RAX)); - a.emit_mov( + // Restore stack pointer. + if stack_offset > 0 { + a.emit_add( Size::S64, - Location::GPR(GPR::RAX), - Location::Memory(GPR::RSP, stack_offset + 8 + caller_stack_offset), + Location::Imm32(stack_offset as u32), + Location::GPR(GPR::RSP), ); - caller_stack_offset += 8; - continue; } - }; - a.emit_mov(Size::S64, prev_loc, target); - } - - // Restore stack pointer. 
-        if stack_offset > 0 {
-            a.emit_add(
-                Size::S64,
-                Location::Imm32(stack_offset as u32),
-                Location::GPR(GPR::RSP),
-            );
+                // Restore stack pointer.
+                if stack_offset > 0 {
+                    a.emit_add(
+                        Size::S64,
+                        Location::Imm32(stack_offset as u32),
+                        Location::GPR(GPR::RSP),
+                    );
+                }
+            }
         }
     }
 
@@ -9151,31 +9159,31 @@ pub fn gen_import_call_trampoline(
 
     let offset = vmoffsets.vmctx_vmfunction_import(index);
 
-    #[cfg(target_os = "windows")]
-    {
-        a.emit_mov(
-            Size::S64,
-            Location::Memory(GPR::RCX, offset as i32), // function pointer
-            Location::GPR(GPR::RAX),
-        );
-        a.emit_mov(
-            Size::S64,
-            Location::Memory(GPR::RCX, offset as i32 + 8), // target vmctx
-            Location::GPR(GPR::RCX),
-        );
-    }
-    #[cfg(not(target_os = "windows"))]
-    {
-        a.emit_mov(
-            Size::S64,
-            Location::Memory(GPR::RDI, offset as i32), // function pointer
-            Location::GPR(GPR::RAX),
-        );
-        a.emit_mov(
-            Size::S64,
-            Location::Memory(GPR::RDI, offset as i32 + 8), // target vmctx
-            Location::GPR(GPR::RDI),
-        );
+    match calling_convention {
+        CallingConvention::WindowsFastcall => {
+            a.emit_mov(
+                Size::S64,
+                Location::Memory(GPR::RCX, offset as i32), // function pointer
+                Location::GPR(GPR::RAX),
+            );
+            a.emit_mov(
+                Size::S64,
+                Location::Memory(GPR::RCX, offset as i32 + 8), // target vmctx
+                Location::GPR(GPR::RCX),
+            );
+        }
+        _ => {
+            a.emit_mov(
+                Size::S64,
+                Location::Memory(GPR::RDI, offset as i32), // function pointer
+                Location::GPR(GPR::RAX),
+            );
+            a.emit_mov(
+                Size::S64,
+                Location::Memory(GPR::RDI, offset as i32 + 8), // target vmctx
+                Location::GPR(GPR::RDI),
+            );
+        }
     }
 
     a.emit_host_redirection(GPR::RAX);

diff --git a/lib/compiler-singlepass/src/compiler.rs b/lib/compiler-singlepass/src/compiler.rs
index d470d5d08b3..659cfcdefd4 100644
--- a/lib/compiler-singlepass/src/compiler.rs
+++ b/lib/compiler-singlepass/src/compiler.rs
@@ -12,10 +12,10 @@ use loupe::MemoryUsage;
 use rayon::prelude::{IntoParallelIterator, ParallelIterator};
 use std::sync::Arc;
 use wasmer_compiler::{
-    Architecture, Compilation, CompileError, CompileModuleInfo, CompiledFunction, Compiler,
-    CompilerConfig, FunctionBinaryReader, FunctionBody, FunctionBodyData, MiddlewareBinaryReader,
-    ModuleMiddleware, ModuleMiddlewareChain, ModuleTranslationState, OperatingSystem, SectionIndex,
-    Target, TrapInformation,
+    Architecture, CallingConvention, Compilation, CompileError, CompileModuleInfo,
+    CompiledFunction, Compiler, CompilerConfig, FunctionBinaryReader, FunctionBody,
+    FunctionBodyData, MiddlewareBinaryReader, ModuleMiddleware, ModuleMiddlewareChain,
+    ModuleTranslationState, OperatingSystem, SectionIndex, Target, TrapInformation,
 };
 use wasmer_types::entity::{EntityRef, PrimaryMap};
 use wasmer_types::{
@@ -68,6 +68,13 @@ impl Compiler for SinglepassCompiler {
         if compile_info.features.multi_value {
             return Err(CompileError::UnsupportedFeature("multivalue".to_string()));
         }
+        let calling_convention = match target.triple().default_calling_convention() {
+            Ok(CallingConvention::WindowsFastcall) => CallingConvention::WindowsFastcall,
+            Ok(CallingConvention::SystemV) => CallingConvention::SystemV,
+            //Ok(CallingConvention::AppleAarch64) => AppleAarch64,
+            _ => panic!("Unsupported Calling convention for Singlepass compiler"),
+        };
+
         let memory_styles = &compile_info.memory_styles;
         let table_styles = &compile_info.table_styles;
         let vmoffsets = VMOffsets::new(8, &compile_info.module);
@@ -77,7 +84,12 @@ impl Compiler for SinglepassCompiler {
             .collect::<Vec<_>>()
             .into_par_iter_if_rayon()
             .map(|i| {
-                gen_import_call_trampoline(&vmoffsets, i, &module.signatures[module.functions[i]])
+                gen_import_call_trampoline(
+                    &vmoffsets,
+                    i,
+                    &module.signatures[module.functions[i]],
+                    calling_convention,
+                )
             })
             .collect::<Vec<_>>()
            .into_iter()
             .collect::<PrimaryMap<_, _>>();
@@ -133,7 +145,7 @@ impl Compiler for SinglepassCompiler {
             .values()
             .collect::<Vec<_>>()
             .into_par_iter_if_rayon()
-            .map(gen_std_trampoline)
+            .map(|func_type| gen_std_trampoline(&func_type, calling_convention))
             .collect::<Vec<_>>()
             .into_iter()
             .collect::<PrimaryMap<_, _>>();
@@ -142,7 +154,9 @@ impl Compiler for SinglepassCompiler {
             .imported_function_types()
             .collect::<Vec<_>>()
             .into_par_iter_if_rayon()
-            .map(|func_type| gen_std_dynamic_import_trampoline(&vmoffsets, &func_type))
+            .map(|func_type| {
+                gen_std_dynamic_import_trampoline(&vmoffsets, &func_type, calling_convention)
+            })
             .collect::<Vec<_>>()
             .into_iter()
             .collect::<PrimaryMap<_, _>>();

diff --git a/lib/compiler-singlepass/src/config.rs b/lib/compiler-singlepass/src/config.rs
index 78496e83d40..c3fa980dbf3 100644
--- a/lib/compiler-singlepass/src/config.rs
+++ b/lib/compiler-singlepass/src/config.rs
@@ -4,7 +4,9 @@
 use crate::compiler::SinglepassCompiler;
 use loupe::MemoryUsage;
 use std::sync::Arc;
-use wasmer_compiler::{Compiler, CompilerConfig, CpuFeature, ModuleMiddleware, Target};
+use wasmer_compiler::{
+    CallingConvention, Compiler, CompilerConfig, CpuFeature, ModuleMiddleware, Target,
+};
 use wasmer_types::Features;
 
 #[derive(Debug, Clone, MemoryUsage)]
@@ -13,6 +15,8 @@ pub struct Singlepass {
     pub(crate) enable_stack_check: bool,
     /// The middleware chain.
     pub(crate) middlewares: Vec<Arc<dyn ModuleMiddleware>>,
+    #[loupe(skip)]
+    pub(crate) calling_convention: CallingConvention,
 }
 
 impl Singlepass {
@@ -23,6 +27,12 @@ impl Singlepass {
             enable_nan_canonicalization: true,
             enable_stack_check: false,
             middlewares: vec![],
+            calling_convention: match Target::default().triple().default_calling_convention() {
+                Ok(CallingConvention::WindowsFastcall) => CallingConvention::WindowsFastcall,
+                Ok(CallingConvention::SystemV) => CallingConvention::SystemV,
+                //Ok(CallingConvention::AppleAarch64) => AppleAarch64,
+                _ => panic!("Unsupported Calling convention for Singlepass"),
+            },
         }
     }

diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs
index f16b3f7e771..2ecbf245aaf 100644
--- a/lib/compiler-singlepass/src/machine.rs
+++ b/lib/compiler-singlepass/src/machine.rs
@@ -6,6 +6,7 @@ use smallvec::SmallVec;
 use std::cmp;
 use std::collections::HashSet;
 use wasmer_compiler::wasmparser::Type as WpType;
+use wasmer_compiler::{CallingConvention, Target};
 
 const NATIVE_PAGE_SIZE: usize = 4096;
 
@@ -330,6 +331,7 @@ impl Machine {
         a: &mut E,
         n: usize,
         n_params: usize,
+        calling_convention: CallingConvention,
     ) -> Vec<Location> {
         // Determine whether a local should be allocated on the stack.
         fn is_local_on_stack(idx: usize) -> bool {
@@ -432,7 +434,7 @@ impl Machine {
         // Locals are allocated on the stack from higher address to lower address,
         // so we won't skip the stack guard page here.
         for i in 0..n_params {
-            let loc = Self::get_param_location(i + 1);
+            let loc = Self::get_param_location(i + 1, calling_convention);
             match loc {
                 Location::GPR(_) => {
                     a.emit_mov(Size::S64, loc, locations[i]);
@@ -454,7 +456,7 @@ impl Machine {
         // Load vmctx into R15.
a.emit_mov( Size::S64, - Self::get_param_location(0), + Self::get_param_location(0, calling_convention), Location::GPR(GPR::R15), ); @@ -521,26 +523,24 @@ impl Machine { } } - #[cfg(target_os = "windows")] - pub fn get_param_location(idx: usize) -> Location { - match idx { - 0 => Location::GPR(GPR::RCX), - 1 => Location::GPR(GPR::RDX), - 2 => Location::GPR(GPR::R8), - 3 => Location::GPR(GPR::R9), - _ => Location::Memory(GPR::RBP, (16 + 32 + (idx - 4) * 8) as i32), - } - } - #[cfg(not(target_os = "windows"))] - pub fn get_param_location(idx: usize) -> Location { - match idx { - 0 => Location::GPR(GPR::RDI), - 1 => Location::GPR(GPR::RSI), - 2 => Location::GPR(GPR::RDX), - 3 => Location::GPR(GPR::RCX), - 4 => Location::GPR(GPR::R8), - 5 => Location::GPR(GPR::R9), - _ => Location::Memory(GPR::RBP, (16 + (idx - 6) * 8) as i32), + pub fn get_param_location(idx: usize, calling_convention: CallingConvention) -> Location { + match calling_convention { + CallingConvention::WindowsFastcall => match idx { + 0 => Location::GPR(GPR::RCX), + 1 => Location::GPR(GPR::RDX), + 2 => Location::GPR(GPR::R8), + 3 => Location::GPR(GPR::R9), + _ => Location::Memory(GPR::RBP, (16 + 32 + (idx - 4) * 8) as i32), + }, + _ => match idx { + 0 => Location::GPR(GPR::RDI), + 1 => Location::GPR(GPR::RSI), + 2 => Location::GPR(GPR::RDX), + 3 => Location::GPR(GPR::RCX), + 4 => Location::GPR(GPR::R8), + 5 => Location::GPR(GPR::R9), + _ => Location::Memory(GPR::RBP, (16 + (idx - 6) * 8) as i32), + }, } } } diff --git a/lib/compiler-singlepass/src/x64_decl.rs b/lib/compiler-singlepass/src/x64_decl.rs index 0c20f5ae137..5ca31900543 100644 --- a/lib/compiler-singlepass/src/x64_decl.rs +++ b/lib/compiler-singlepass/src/x64_decl.rs @@ -2,6 +2,7 @@ use crate::common_decl::{MachineState, MachineValue, RegisterIndex}; use std::collections::BTreeMap; +use wasmer_compiler::{CallingConvention, Target}; use wasmer_types::Type; /// General-purpose registers. @@ -170,73 +171,75 @@ pub struct ArgumentRegisterAllocator { impl ArgumentRegisterAllocator { /// Allocates a register for argument type `ty`. Returns `None` if no register is available for this type. 
-    #[cfg(target_os = "windows")]
-    pub fn next(&mut self, ty: Type) -> Option<X64Register> {
-        static GPR_SEQ: &'static [GPR] = &[GPR::RCX, GPR::RDX, GPR::R8, GPR::R9];
-        static XMM_SEQ: &'static [XMM] = &[XMM::XMM0, XMM::XMM1, XMM::XMM2, XMM::XMM3];
-        let idx = self.n_gprs + self.n_xmms;
-        match ty {
-            Type::I32 | Type::I64 => {
-                if idx < 4 {
-                    let gpr = GPR_SEQ[idx];
-                    self.n_gprs += 1;
-                    Some(X64Register::GPR(gpr))
-                } else {
-                    None
+    pub fn next(&mut self, ty: Type, calling_convention: CallingConvention) -> Option<X64Register> {
+        match calling_convention {
+            CallingConvention::WindowsFastcall => {
+                static GPR_SEQ: &'static [GPR] = &[GPR::RCX, GPR::RDX, GPR::R8, GPR::R9];
+                static XMM_SEQ: &'static [XMM] = &[XMM::XMM0, XMM::XMM1, XMM::XMM2, XMM::XMM3];
+                let idx = self.n_gprs + self.n_xmms;
+                match ty {
+                    Type::I32 | Type::I64 => {
+                        if idx < 4 {
+                            let gpr = GPR_SEQ[idx];
+                            self.n_gprs += 1;
+                            Some(X64Register::GPR(gpr))
+                        } else {
+                            None
+                        }
+                    }
+                    Type::F32 | Type::F64 => {
+                        if idx < 4 {
+                            let xmm = XMM_SEQ[idx];
+                            self.n_xmms += 1;
+                            Some(X64Register::XMM(xmm))
+                        } else {
+                            None
+                        }
+                    }
+                    _ => todo!(
+                        "ArgumentRegisterAllocator::next: Unsupported type: {:?}",
+                        ty
+                    ),
                 }
             }
-            Type::F32 | Type::F64 => {
-                if idx < 4 {
-                    let xmm = XMM_SEQ[idx];
-                    self.n_xmms += 1;
-                    Some(X64Register::XMM(xmm))
-                } else {
-                    None
+            _ => {
+                static GPR_SEQ: &'static [GPR] =
+                    &[GPR::RDI, GPR::RSI, GPR::RDX, GPR::RCX, GPR::R8, GPR::R9];
+                static XMM_SEQ: &'static [XMM] = &[
+                    XMM::XMM0,
+                    XMM::XMM1,
+                    XMM::XMM2,
+                    XMM::XMM3,
+                    XMM::XMM4,
+                    XMM::XMM5,
+                    XMM::XMM6,
+                    XMM::XMM7,
+                ];
+                match ty {
+                    Type::I32 | Type::I64 => {
+                        if self.n_gprs < GPR_SEQ.len() {
+                            let gpr = GPR_SEQ[self.n_gprs];
+                            self.n_gprs += 1;
+                            Some(X64Register::GPR(gpr))
+                        } else {
+                            None
+                        }
+                    }
+                    Type::F32 | Type::F64 => {
+                        if self.n_xmms < XMM_SEQ.len() {
+                            let xmm = XMM_SEQ[self.n_xmms];
+                            self.n_xmms += 1;
+                            Some(X64Register::XMM(xmm))
+                        } else {
+                            None
+                        }
+                    }
+                    _ => todo!(
+                        "ArgumentRegisterAllocator::next: Unsupported type: {:?}",
+                        ty
+                    ),
                 }
             }
-            _ => todo!(
-                "ArgumentRegisterAllocator::next: Unsupported type: {:?}",
-                ty
-            ),
-        }
-    }
-    #[cfg(not(target_os = "windows"))]
-    pub fn next(&mut self, ty: Type) -> Option<X64Register> {
-        static GPR_SEQ: &'static [GPR] =
-            &[GPR::RDI, GPR::RSI, GPR::RDX, GPR::RCX, GPR::R8, GPR::R9];
-        static XMM_SEQ: &'static [XMM] = &[
-            XMM::XMM0,
-            XMM::XMM1,
-            XMM::XMM2,
-            XMM::XMM3,
-            XMM::XMM4,
-            XMM::XMM5,
-            XMM::XMM6,
-            XMM::XMM7,
-        ];
-        match ty {
-            Type::I32 | Type::I64 => {
-                if self.n_gprs < GPR_SEQ.len() {
-                    let gpr = GPR_SEQ[self.n_gprs];
-                    self.n_gprs += 1;
-                    Some(X64Register::GPR(gpr))
-                } else {
-                    None
-                }
-            }
-            Type::F32 | Type::F64 => {
-                if self.n_xmms < XMM_SEQ.len() {
-                    let xmm = XMM_SEQ[self.n_xmms];
-                    self.n_xmms += 1;
-                    Some(X64Register::XMM(xmm))
-                } else {
-                    None
-                }
-            }
-            _ => todo!(
-                "ArgumentRegisterAllocator::next: Unsupported type: {:?}",
-                ty
-            ),
-        }
     }
 }

From c7544dcea49ab164c5495fc093dd5ebb34ea10d1 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Thu, 16 Sep 2021 11:17:47 +0200
Subject: [PATCH 4/9] feat(compiler) Fix lint

---
 lib/compiler-singlepass/src/machine.rs  | 2 +-
 lib/compiler-singlepass/src/x64_decl.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs
index 2ecbf245aaf..a4010f40f8e 100644
--- a/lib/compiler-singlepass/src/machine.rs
+++ b/lib/compiler-singlepass/src/machine.rs
@@ -6,7 +6,7 @@ use smallvec::SmallVec;
 use std::cmp;
 use std::collections::HashSet;
 use wasmer_compiler::wasmparser::Type as WpType;
-use wasmer_compiler::{CallingConvention, Target};
+use wasmer_compiler::CallingConvention;
 
 const NATIVE_PAGE_SIZE: usize = 4096;
 
diff --git a/lib/compiler-singlepass/src/x64_decl.rs b/lib/compiler-singlepass/src/x64_decl.rs
index 5ca31900543..1a322821fa4 100644
--- a/lib/compiler-singlepass/src/x64_decl.rs
+++ b/lib/compiler-singlepass/src/x64_decl.rs
@@ -2,7 +2,7 @@
 
 use crate::common_decl::{MachineState, MachineValue, RegisterIndex};
 use std::collections::BTreeMap;
-use wasmer_compiler::{CallingConvention, Target};
+use wasmer_compiler::CallingConvention;
 use wasmer_types::Type;
 
 /// General-purpose registers.

From 9c9c0beb6bbb071ba752d96a1e73d0dc96d4c6d0 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Thu, 16 Sep 2021 12:16:18 +0200
Subject: [PATCH 5/9] feat(compiler) Adjust test for unsupported platforms

---
 lib/compiler-singlepass/src/compiler.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/compiler-singlepass/src/compiler.rs b/lib/compiler-singlepass/src/compiler.rs
index 659cfcdefd4..f47ac7019c2 100644
--- a/lib/compiler-singlepass/src/compiler.rs
+++ b/lib/compiler-singlepass/src/compiler.rs
@@ -255,7 +255,7 @@ mod tests {
         let (mut info, translation, inputs) = dummy_compilation_ingredients();
         let result = compiler.compile_module(&win32, &mut info, &translation, inputs);
         match result.unwrap_err() {
-            CompileError::UnsupportedTarget(name) => assert_eq!(name, "windows"), // Windows should be checked before architecture
+            CompileError::UnsupportedTarget(name) => assert_eq!(name, "i686"), // Windows is no longer rejected, so the architecture check fires first
             error => panic!("Unexpected error: {:?}", error),
         };
     }

From c914b482d56156abdf803aa487d1c136e456a5be Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Fri, 17 Sep 2021 11:25:14 +0200
Subject: [PATCH 6/9] feat(compiler) Singlepass+Windows skip_stack_guard_page also needs investigation, skip the test for now

---
 tests/ignores.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/ignores.txt b/tests/ignores.txt
index cac014a355d..d5f845f587f 100644
--- a/tests/ignores.txt
+++ b/tests/ignores.txt
@@ -46,6 +46,7 @@ llvm+dylib+macos+aarch64 * # Tests seem to be randomly failing
 # https://github.com/rust-lang/backtrace-rs/issues/356
 cranelift+aarch64 spec::skip_stack_guard_page # This is skipped for ARM, not fully fixed yet
 llvm+aarch64 spec::skip_stack_guard_page # This is skipped for ARM, not fully fixed yet
+singlepass+windows spec::skip_stack_guard_page # Needs investigation.
 cranelift+windows spec::skip_stack_guard_page # Needs investigation. Issue: `STATUS_ACCESS_VIOLATION` trap happened
 cranelift+macos spec::skip_stack_guard_page # Needs investigation. process didn't exit successfully: (signal: 6, SIGABRT: process abort signal)
 llvm+macos spec::skip_stack_guard_page # Needs investigation. process didn't exit successfully: (signal: 6, SIGABRT: process abort signal)

From 45f56d00c0e362fa76b1557b702ba1dd693f2a04 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Fri, 17 Sep 2021 11:30:11 +0200
Subject: [PATCH 7/9] feat(compiler) Windows ABI needs RSI and RDI to be saved. So save them unconditionally for now

---
 lib/compiler-singlepass/src/codegen_x64.rs |  2 +-
 lib/compiler-singlepass/src/machine.rs     | 35 +++++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/lib/compiler-singlepass/src/codegen_x64.rs b/lib/compiler-singlepass/src/codegen_x64.rs
index 3f3c48e2938..2167578f0a6 100644
--- a/lib/compiler-singlepass/src/codegen_x64.rs
+++ b/lib/compiler-singlepass/src/codegen_x64.rs
@@ -6696,7 +6696,7 @@ impl<'a> FuncGen<'a> {
         if self.control_stack.is_empty() {
             self.assembler.emit_label(frame.label);
             self.machine
-                .finalize_locals(&mut self.assembler, &self.locals);
+                .finalize_locals(&mut self.assembler, &self.locals, self.config.calling_convention);
             self.assembler.emit_mov(
                 Size::S64,
                 Location::GPR(GPR::RBP),

diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs
index a4010f40f8e..d59806cf0bb 100644
--- a/lib/compiler-singlepass/src/machine.rs
+++ b/lib/compiler-singlepass/src/machine.rs
@@ -369,6 +369,11 @@ impl Machine {
         // Callee-saved R15 for vmctx.
         static_area_size += 8;
 
+        // For Windows ABI, save RDI and RSI
+        if calling_convention == CallingConvention::WindowsFastcall {
+            static_area_size += 8 * 2;
+        }
+
         // Total size of callee saved registers.
         let callee_saved_regs_size = static_area_size;
 
@@ -413,6 +418,29 @@ impl Machine {
             X64Register::GPR(GPR::R15).to_index(),
         ));
 
+        if calling_convention == CallingConvention::WindowsFastcall {
+            // Save RDI
+            self.stack_offset.0 += 8;
+            a.emit_mov(
+                Size::S64,
+                Location::GPR(GPR::RDI),
+                Location::Memory(GPR::RBP, -(self.stack_offset.0 as i32)),
+            );
+            self.state.stack_values.push(MachineValue::PreserveRegister(
+                X64Register::GPR(GPR::RDI).to_index(),
+            ));
+            // Save RSI
+            self.stack_offset.0 += 8;
+            a.emit_mov(
+                Size::S64,
+                Location::GPR(GPR::RSI),
+                Location::Memory(GPR::RBP, -(self.stack_offset.0 as i32)),
+            );
+            self.state.stack_values.push(MachineValue::PreserveRegister(
+                X64Register::GPR(GPR::RSI).to_index(),
+            ));
+        }
+
         // Save the offset of register save area.
         self.save_area_offset = Some(MachineStackOffset(self.stack_offset.0));
 
@@ -501,7 +529,7 @@ impl Machine {
         locations
     }
 
-    pub fn finalize_locals<E: Emitter>(&mut self, a: &mut E, locations: &[Location]) {
+    pub fn finalize_locals<E: Emitter>(&mut self, a: &mut E, locations: &[Location], calling_convention: CallingConvention) {
         // Unwind stack to the "save area".
         a.emit_lea(
             Size::S64,
@@ -512,6 +540,11 @@ impl Machine {
             Location::GPR(GPR::RSP),
         );
 
+        if calling_convention == CallingConvention::WindowsFastcall {
+            // Restore RSI and RDI
+            a.emit_pop(Size::S64, Location::GPR(GPR::RSI));
+            a.emit_pop(Size::S64, Location::GPR(GPR::RDI));
+        }
         // Restore R15 used by vmctx.
        a.emit_pop(Size::S64, Location::GPR(GPR::R15));

From ca5dec1a29656aba96a283b7b301783ffd23e8f9 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Fri, 17 Sep 2021 11:31:10 +0200
Subject: [PATCH 8/9] feat(compiler) Use special __intrinsic_setjmp instead of regular setjmp on Windows to avoid potential issue with jitted code

---
 lib/vm/src/trap/handlers.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/vm/src/trap/handlers.c b/lib/vm/src/trap/handlers.c
index ddee4b0c2f2..015e67cdc87 100644
--- a/lib/vm/src/trap/handlers.c
+++ b/lib/vm/src/trap/handlers.c
@@ -14,7 +14,11 @@
 // doesn't need to touch the kernel signal handling routines.
 // In case of macOS, stackoverflow
 #if defined(CFG_TARGET_OS_WINDOWS)
-#define platform_setjmp(buf) setjmp(buf)
+// On Windows, the default setjmp/longjmp sequence tries to unwind the stack.
+// That's fine most of the time, but not for JIT'd code that may not respect stack ordering.
+// Use a special setjmp here, with NULL as the second parameter, to disable that behaviour
+// and get a regular, simple setjmp/longjmp sequence.
+#define platform_setjmp(buf) __intrinsic_setjmp(buf, NULL)
 #define platform_longjmp(buf, arg) longjmp(buf, arg)
 #define platform_jmp_buf jmp_buf
 #elif defined(CFG_TARGET_OS_MACOS)

From d11ef790867e9a23137bb39d68110f1178409757 Mon Sep 17 00:00:00 2001
From: ptitSeb
Date: Fri, 17 Sep 2021 11:36:32 +0200
Subject: [PATCH 9/9] Fix linting

---
 lib/compiler-singlepass/src/codegen_x64.rs | 7 +++++--
 lib/compiler-singlepass/src/machine.rs     | 7 ++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/lib/compiler-singlepass/src/codegen_x64.rs b/lib/compiler-singlepass/src/codegen_x64.rs
index 2167578f0a6..66f1f96ea05 100644
--- a/lib/compiler-singlepass/src/codegen_x64.rs
+++ b/lib/compiler-singlepass/src/codegen_x64.rs
@@ -6695,8 +6695,11 @@ impl<'a> FuncGen<'a> {
 
         if self.control_stack.is_empty() {
             self.assembler.emit_label(frame.label);
-            self.machine
-                .finalize_locals(&mut self.assembler, &self.locals, self.config.calling_convention);
+            self.machine.finalize_locals(
+                &mut self.assembler,
+                &self.locals,
+                self.config.calling_convention,
+            );
             self.assembler.emit_mov(
                 Size::S64,
                 Location::GPR(GPR::RBP),

diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs
index d59806cf0bb..73f75bc7be0 100644
--- a/lib/compiler-singlepass/src/machine.rs
+++ b/lib/compiler-singlepass/src/machine.rs
@@ -529,7 +529,12 @@ impl Machine {
         locations
     }
 
-    pub fn finalize_locals<E: Emitter>(&mut self, a: &mut E, locations: &[Location], calling_convention: CallingConvention) {
+    pub fn finalize_locals<E: Emitter>(
+        &mut self,
+        a: &mut E,
+        locations: &[Location],
+        calling_convention: CallingConvention,
+    ) {
         // Unwind stack to the "save area".
         a.emit_lea(
             Size::S64,
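
For reference, a self-contained sketch of the argument mapping the series implements, mirroring `Machine::get_param_location` from patch 3 (the `Loc` enum and `param_location` function are illustrative stand-ins, not part of the patches):

    // Windows fastcall: RCX, RDX, R8, R9, then stack slots above the
    // 32-byte shadow space; System V: RDI, RSI, RDX, RCX, R8, R9.
    #[derive(Debug, PartialEq)]
    enum Loc {
        Gpr(&'static str),
        Stack(i32), // offset from RBP
    }

    fn param_location(idx: usize, windows_fastcall: bool) -> Loc {
        if windows_fastcall {
            match idx {
                0 => Loc::Gpr("rcx"),
                1 => Loc::Gpr("rdx"),
                2 => Loc::Gpr("r8"),
                3 => Loc::Gpr("r9"),
                // 16 = saved RBP + return address; 32 = callee shadow space.
                _ => Loc::Stack((16 + 32 + (idx - 4) * 8) as i32),
            }
        } else {
            match idx {
                0 => Loc::Gpr("rdi"),
                1 => Loc::Gpr("rsi"),
                2 => Loc::Gpr("rdx"),
                3 => Loc::Gpr("rcx"),
                4 => Loc::Gpr("r8"),
                5 => Loc::Gpr("r9"),
                _ => Loc::Stack((16 + (idx - 6) * 8) as i32),
            }
        }
    }

    fn main() {
        // The fifth integer argument lands in R8 under System V but in the
        // first stack slot above the shadow space under Windows fastcall.
        assert_eq!(param_location(4, false), Loc::Gpr("r8"));
        assert_eq!(param_location(4, true), Loc::Stack(48));
    }

Note how the WindowsFastcall branch of `ArgumentRegisterAllocator::next` (x64_decl.rs above) consumes a single shared slot index for GPRs and XMMs — on Windows, argument n always occupies register slot n whether it is integer or float — while the System V branch advances independent GPR and XMM counters.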