Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix for #2721 #2723

Merged
merged 2 commits into from
Dec 14, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions lib/compiler-singlepass/src/machine_x64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1807,15 +1807,11 @@ impl Machine for MachineX86_64 {
fn push_location_for_native(&mut self, loc: Location) {
match loc {
Location::Imm64(_) => {
// Dummy value slot to be filled with `mov`.
self.assembler.emit_push(Size::S64, Location::GPR(GPR::RAX));

// Use R9 as the temporary register here, since:
// - It is a temporary register that is not used for any persistent value.
// - This register as an argument location is only written to after `sort_call_movs`.'
// Push R9 value slot to be exchange with `mov`.
self.assembler.emit_push(Size::S64, Location::GPR(GPR::R9));
self.reserve_unused_temp_gpr(GPR::R9);
self.move_location(Size::S64, loc, Location::GPR(GPR::R9));
self.move_location(
self.assembler.emit_xchg(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On x86, the xchg instruction always does an atomic swap, which is very slow. Try to use a normal mov instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know, but I need to save the value of the register.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably cheaper to add another pair of push/pop for r9 than to use xchg:

push rax (dummy)
push r9
mov r9, #imm64
mov [sp + 8], r9
pop r9

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it's not.
I have done this simple program:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>


int main(int argc, const char* argv)
{
    struct timespec t1, t2;

    clock_gettime(CLOCK_MONOTONIC, &t1);
    #define NUM 100000
    for(int i=0; i<NUM; ++i)
        asm volatile ("\n\
            push %%r9\n\
            mov $0x123456789abcdef, %%r9\n\
            xchg (%%rsp), %%r9\n\
            pop  %%r9\n\
        "::);
    clock_gettime(CLOCK_MONOTONIC, &t2);
    uint64_t t = t1.tv_sec*1000000000LL + t1.tv_nsec;
    t = (t2.tv_sec*1000000000LL + t2.tv_nsec) - t;
    printf("Xchg: %llu nsec for %d occurences\n", t, NUM);
    for(int i=0; i<NUM; ++i)
        asm volatile ("\n\
            push %%rax\n\
            push %%r9\n\
            mov $0x123456789abcdef, %%r9\n\
            mov 0x8(%%rsp), %%r9\n\
            pop  %%r9\n\
            pop %%rax\n\
        "::);
    clock_gettime(CLOCK_MONOTONIC, &t2);
    t = t1.tv_sec*1000000000LL + t1.tv_nsec;
    t = (t2.tv_sec*1000000000LL + t2.tv_nsec) - t;
    printf("Push: %llu nsec for %d occurences\n", t, NUM);
}

And while I have some variation, the xchg method is always faster:

seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1911800 nsec for 100000 occurences
Push: 2216700 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1156300 nsec for 100000 occurences
Push: 1452800 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1156000 nsec for 100000 occurences
Push: 1452800 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1345700 nsec for 100000 occurences
Push: 1668700 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1499400 nsec for 100000 occurences
Push: 1898400 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1185400 nsec for 100000 occurences
Push: 1493200 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1276400 nsec for 100000 occurences
Push: 1598700 nsec for 100000 occurences
seb@Seb-PC:~/git/wasmer$ ./test
Xchg: 1276800 nsec for 100000 occurences
Push: 1596000 nsec for 100000 occurences

The implicit LOCK is not that slow, compare to a write+read in memory that the push/pop.

Size::S64,
Location::GPR(GPR::R9),
Location::Memory(GPR::RSP, 0),
Expand Down
149 changes: 149 additions & 0 deletions tests/compilers/issues.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,152 @@ fn issue_2329(mut config: crate::Config) -> Result<()> {
instance.exports.get_function("read_memory")?.call(&[])?;
Ok(())
}

#[compiler_test(issues)]
fn issue_2721(mut config: crate::Config) -> Result<()> {
let store = config.store();
let memory = Memory::new(
&store,
MemoryType::new(Pages(1024), Some(Pages(2048)), false),
)
.unwrap();

#[derive(Clone, WasmerEnv)]
pub struct Env {
memory: Memory,
}

pub fn banana(
env: &Env,
a: u64,
b: u64,
c: u64,
d: u64,
e: u64,
f: u64,
g: u64,
h: u64,
) -> u64 {
println!("{:?}", (a, b, c, d, e, f, g, h));
let view = env.memory.view::<u8>();
let bytes = view
.get(e as usize..(e + d) as usize)
.unwrap()
.into_iter()
.map(|b| b.get())
.collect::<Vec<u8>>();
let input_string = std::str::from_utf8(&bytes).unwrap();
assert_eq!(input_string, "bananapeach");
0
}

pub fn mango(env: &Env, a: u64) {}

pub fn pineapple(env: &Env, a: u64) -> u64 {
0
}

pub fn peach(env: &Env, a: u64, b: u64) -> u64 {
0
}

pub fn kiwi(env: &Env, a: u32) {}

let wat = r#"
(module
(type (;0;) (func (param i64)))
(type (;1;) (func (param i64) (result i64)))
(type (;2;) (func (param i64 i64) (result i64)))
(type (;3;) (func (param i64 i64 i64 i64 i64 i64 i64 i64) (result i64)))
(type (;4;) (func))
(import "env" "mango" (func (;0;) (type 0)))
(import "env" "pineapple" (func (;1;) (type 1)))
(import "env" "peach" (func (;2;) (type 2)))
(import "env" "banana" (func (;3;) (type 3)))
(import "env" "memory" (memory (;0;) 1024 2048))
(func (;4;) (type 4)
(local i32 i64)
global.get 0
i32.const 32
i32.sub
local.tee 0
global.set 0
local.get 0
i32.const 8
i32.add
i64.const 0
i64.store
local.get 0
i64.const 0
i64.store
i64.const 0
call 0
i64.const 0
call 1
local.set 1
local.get 0
i64.const 0
i64.store offset=24
local.get 0
i64.const 0
i64.store offset=16
i64.const 0
i64.const 0
call 2
local.get 1
local.get 0
i64.extend_i32_u
i64.const 11
i32.const 1048576
i64.extend_i32_u
i64.const 0
i64.const 0
local.get 0
i32.const 16
i32.add
i64.extend_i32_u
call 3
return)
(global (;0;) (mut i32) (i32.const 1048576))
(global (;1;) i32 (i32.const 1048587))
(global (;2;) i32 (i32.const 1048592))
(global (;3;) (mut i32) (i32.const 0))
(export "memory" (memory 0))
(export "repro" (func 4))
(export "__data_end" (global 1))
(export "__heap_base" (global 2))
(data (;0;) (i32.const 1048576) "bananapeach"))
"#;

let module = Module::new(&store, wat)?;
let env = Env {
memory: memory.clone(),
};
let mut exports = Exports::new();
exports.insert("memory", memory);
exports.insert(
"banana",
Function::new_native_with_env(&store, env.clone(), banana),
);
exports.insert(
"peach",
Function::new_native_with_env(&store, env.clone(), peach),
);
exports.insert(
"pineapple",
Function::new_native_with_env(&store, env.clone(), pineapple),
);
exports.insert(
"mango",
Function::new_native_with_env(&store, env.clone(), mango),
);
exports.insert(
"kiwi",
Function::new_native_with_env(&store, env.clone(), kiwi),
);
let mut imports = ImportObject::new();
imports.register("env", exports);
let instance = Instance::new(&module, &imports)?;
instance.exports.get_function("repro")?.call(&[])?;
Ok(())
}