Skip to content

Commit

Permalink
auto merge of #12172 : alexcrichton/rust/green-improvements, r=brson
Browse files Browse the repository at this point in the history
These commits pick off some low-hanging fruit which were slowing down spawning green threads. The major speedup comes from fixing a bug in stack caching where we never used any cached stacks!

The program I used to benchmark is at the end. It was compiled with `rustc --opt-level=3 bench.rs --test` and run as `RUST_THREADS=1 ./bench --bench`. I chose to use `RUST_THREADS=1` due to #11730 as the profiles I was getting interfered too much when all the schedulers were in play (and shouldn't be after #11730 is fixed). All of the units below are in ns/iter as reported by `--bench` (lower is better).

|               | green | native | raw    |
| ------------- | ----- | ------ | ------ |
| osx before    | 12699 | 24030  | 19734  |
| linux before  | 10223 | 125983 | 122647 |
| osx after     |  3847 | 25771  | 20835  |
| linux after   |  2631 | 135398 | 122765 |

Note that this is *not* a benchmark of spawning green tasks vs native tasks. I put in the native numbers just to get a ballpark of where green tasks are. This benchmark is *clearly* benefiting from stack caching. Also, OSX is clearly not 5x faster than linux; I think my VM is just much slower.

All in all, this ended up being a nice 4x speedup for spawning a green task when you're using a cached stack.

```rust
extern mod extra;
extern mod native;
use std::rt::thread::Thread;

// Benchmark reported in the "green" column of the table above.
//
// Each iteration spawns one task with the default `spawn` and waits for it to
// report back, so the measured time covers the spawn plus one channel
// round trip.
#[bench]
fn green(bh: &mut extra::test::BenchHarness) {
    // One shared channel for the whole benchmark: `p` is the receiving port,
    // `c` the sending half that gets cloned for every spawned task.
    let (p, c) = SharedChan::new();
    bh.iter(|| {
        // The spawned proc takes ownership of its own clone of the sender.
        let c = c.clone();
        spawn(proc() {
            c.send(());
        });
        // Block until the spawned task has sent its message.
        p.recv();
    });
}

// Benchmark reported in the "native" column of the table above.
//
// Same shape as the `green` benchmark, except the task is created through
// `native::task::spawn`; each iteration covers the spawn plus one channel
// round trip.
#[bench]
fn native(bh: &mut extra::test::BenchHarness) {
    // `p` is the receiving port, `c` the sender cloned for each spawned task.
    let (p, c) = SharedChan::new();
    bh.iter(|| {
        // The spawned proc takes ownership of its own clone of the sender.
        let c = c.clone();
        native::task::spawn(proc() {
            c.send(());
        });
        // Block until the spawned task has sent its message.
        p.recv();
    });
}

// Benchmark reported in the "raw" column of the table above.
//
// Each iteration starts a `Thread` running an empty proc and immediately
// joins it; no channel or task machinery is involved.
#[bench]
fn raw(bh: &mut extra::test::BenchHarness) {
    bh.iter(|| {
        Thread::start(proc() {}).join()
    });
}
```
  • Loading branch information
bors committed Feb 14, 2014
2 parents 68129d2 + 301ff0c commit 22c34f3
Show file tree
Hide file tree
Showing 13 changed files with 231 additions and 141 deletions.
2 changes: 1 addition & 1 deletion mk/crates.mk
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ TOOLS := compiletest rustdoc rustc

DEPS_std := native:rustrt native:compiler-rt
DEPS_extra := std term sync serialize getopts collections
DEPS_green := std
DEPS_green := std native:context_switch
DEPS_rustuv := std native:uv native:uv_support
DEPS_native := std
DEPS_syntax := std extra term serialize collections
Expand Down
5 changes: 3 additions & 2 deletions mk/rt.mk
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
# that's per-target so you're allowed to conditionally add files based on the
# target.
################################################################################
NATIVE_LIBS := rustrt sundown uv_support morestack miniz
NATIVE_LIBS := rustrt sundown uv_support morestack miniz context_switch

# $(1) is the target triple
define NATIVE_LIBRARIES
Expand All @@ -54,9 +54,10 @@ NATIVE_DEPS_rustrt_$(1) := rust_builtin.c \
rust_android_dummy.c \
rust_test_helpers.c \
rust_try.ll \
arch/$$(HOST_$(1))/_context.S \
arch/$$(HOST_$(1))/record_sp.S
NATIVE_DEPS_morestack_$(1) := arch/$$(HOST_$(1))/morestack.S
NATIVE_DEPS_context_switch_$(1) := \
arch/$$(HOST_$(1))/_context.S

################################################################################
# You shouldn't find it that necessary to edit anything below this line.
Expand Down
113 changes: 60 additions & 53 deletions src/libgreen/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::libc::c_void;
use std::uint;
use std::cast::{transmute, transmute_mut_unsafe,
transmute_region, transmute_mut_region};
use stack::Stack;
use std::unstable::stack;
use std::unstable::raw;

// FIXME #7761: Registers is boxed so that it is 16-byte aligned, for storing
// SSE regs. It would be marginally better not to do this. In C++ we
Expand All @@ -22,47 +22,33 @@ use std::unstable::stack;
// the registers are sometimes empty, but the discriminant would
// then misalign the regs again.
pub struct Context {
/// The context entry point, saved here for later destruction
priv start: Option<~proc()>,
/// Hold the registers while the task or scheduler is suspended
priv regs: ~Registers,
/// Lower bound and upper bound for the stack
priv stack_bounds: Option<(uint, uint)>,
}

pub type InitFn = extern "C" fn(uint, *(), *()) -> !;

impl Context {
pub fn empty() -> Context {
Context {
start: None,
regs: new_regs(),
stack_bounds: None,
}
}

/// Create a new context that will resume execution by running proc()
pub fn new(start: proc(), stack: &mut Stack) -> Context {
// The C-ABI function that is the task entry point
//
// Note that this function is a little sketchy. We're taking a
// procedure, transmuting it to a stack-closure, and then calling the
// closure. This leverages the fact that the representation of these two
// types is the same.
//
// The reason that we're doing this is that this procedure is expected
// to never return. The codegen which frees the environment of the
// procedure occurs *after* the procedure has completed, and this means
// that we'll never actually free the procedure.
//
// To solve this, we use this transmute (to not trigger the procedure
// deallocation here), and then store a copy of the procedure in the
// `Context` structure returned. When the `Context` is deallocated, then
// the entire procedure box will be deallocated as well.
extern fn task_start_wrapper(f: &proc()) {
unsafe {
let f: &|| = transmute(f);
(*f)()
}
}
///
/// The `init` function will be run with `arg` and the `start` procedure
/// split up into code and env pointers. It is required that the `init`
/// function never return.
///
/// FIXME: this is basically an awful interface. The main reason for
/// this is to reduce the number of allocations made when a green
/// task is spawned as much as possible
pub fn new(init: InitFn, arg: uint, start: proc(),
stack: &mut Stack) -> Context {

let sp: *uint = stack.end();
let sp: *mut uint = unsafe { transmute_mut_unsafe(sp) };
Expand All @@ -74,14 +60,10 @@ impl Context {
transmute_region(&*regs));
};

// FIXME #7767: Putting main into a ~ so it's a thin pointer and can
// be passed to the spawn function. Another unfortunate
// allocation
let start = ~start;

initialize_call_frame(&mut *regs,
task_start_wrapper as *c_void,
unsafe { transmute(&*start) },
init,
arg,
unsafe { transmute(start) },
sp);

// Scheduler tasks don't have a stack in the "we allocated it" sense,
Expand All @@ -96,7 +78,6 @@ impl Context {
Some((stack_base as uint, sp as uint))
};
return Context {
start: Some(start),
regs: regs,
stack_bounds: bounds,
}
Expand Down Expand Up @@ -138,7 +119,7 @@ impl Context {
}
}

#[link(name = "rustrt", kind = "static")]
#[link(name = "context_switch", kind = "static")]
extern {
fn rust_swap_registers(out_regs: *mut Registers, in_regs: *Registers);
}
Expand Down Expand Up @@ -185,13 +166,17 @@ fn new_regs() -> ~Registers {
}

#[cfg(target_arch = "x86")]
fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
sp: *mut uint) {
fn initialize_call_frame(regs: &mut Registers, fptr: InitFn, arg: uint,
procedure: raw::Procedure, sp: *mut uint) {

// x86 has interesting stack alignment requirements, so do some alignment
// plus some offsetting to figure out what the actual stack should be.
let sp = align_down(sp);
let sp = mut_offset(sp, -4);

unsafe { *sp = arg as uint };
unsafe { *mut_offset(sp, 2) = procedure.env as uint };
unsafe { *mut_offset(sp, 1) = procedure.code as uint };
unsafe { *mut_offset(sp, 0) = arg as uint };
let sp = mut_offset(sp, -1);
unsafe { *sp = 0 }; // The final return address

Expand All @@ -215,14 +200,18 @@ fn new_regs() -> ~Registers { ~([0, .. 34]) }
fn new_regs() -> ~Registers { ~([0, .. 22]) }

#[cfg(target_arch = "x86_64")]
fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
sp: *mut uint) {
fn initialize_call_frame(regs: &mut Registers, fptr: InitFn, arg: uint,
procedure: raw::Procedure, sp: *mut uint) {
extern { fn rust_bootstrap_green_task(); }

// Redefinitions from rt/arch/x86_64/regs.h
static RUSTRT_ARG0: uint = 3;
static RUSTRT_RSP: uint = 1;
static RUSTRT_IP: uint = 8;
static RUSTRT_RBP: uint = 2;
static RUSTRT_R12: uint = 4;
static RUSTRT_R13: uint = 5;
static RUSTRT_R14: uint = 6;
static RUSTRT_R15: uint = 7;

let sp = align_down(sp);
let sp = mut_offset(sp, -1);
Expand All @@ -231,13 +220,23 @@ fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
unsafe { *sp = 0; }

rtdebug!("creating call frame");
rtdebug!("fptr {}", fptr);
rtdebug!("arg {}", arg);
rtdebug!("fptr {:#x}", fptr as uint);
rtdebug!("arg {:#x}", arg);
rtdebug!("sp {}", sp);

regs[RUSTRT_ARG0] = arg as uint;
// These registers are frobbed by rust_bootstrap_green_task into the right
// location so we can invoke the "real init function", `fptr`.
regs[RUSTRT_R12] = arg as uint;
regs[RUSTRT_R13] = procedure.code as uint;
regs[RUSTRT_R14] = procedure.env as uint;
regs[RUSTRT_R15] = fptr as uint;

// These registers are picked up by the regular context switch paths. These
// will put us in "mostly the right context" except for frobbing all the
// arguments to the right place. We have the small trampoline code inside of
// rust_bootstrap_green_task to do that.
regs[RUSTRT_RSP] = sp as uint;
regs[RUSTRT_IP] = fptr as uint;
regs[RUSTRT_IP] = rust_bootstrap_green_task as uint;

// Last base pointer on the stack should be 0
regs[RUSTRT_RBP] = 0;
Expand All @@ -250,18 +249,26 @@ type Registers = [uint, ..32];
fn new_regs() -> ~Registers { ~([0, .. 32]) }

#[cfg(target_arch = "arm")]
fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
sp: *mut uint) {
fn initialize_call_frame(regs: &mut Registers, fptr: InitFn, arg: uint,
procedure: raw::Procedure, sp: *mut uint) {
extern { fn rust_bootstrap_green_task(); }

let sp = align_down(sp);
// sp of arm eabi is 8-byte aligned
let sp = mut_offset(sp, -2);

// The final return address. 0 indicates the bottom of the stack
unsafe { *sp = 0; }

regs[0] = arg as uint; // r0
regs[13] = sp as uint; // #53 sp, r13
regs[14] = fptr as uint; // #60 pc, r15 --> lr
// ARM uses the same technique as x86_64 to have a landing pad for the start
// of all new green tasks. Neither r1/r2 are saved on a context switch, so
// the shim will copy r3/r4 into r1/r2 and then execute the function in r5
regs[0] = arg as uint; // r0
regs[3] = procedure.code as uint; // r3
regs[4] = procedure.env as uint; // r4
regs[5] = fptr as uint; // r5
regs[13] = sp as uint; // #52 sp, r13
regs[14] = rust_bootstrap_green_task as uint; // #56 pc, r14 --> lr
}

#[cfg(target_arch = "mips")]
Expand All @@ -271,8 +278,8 @@ type Registers = [uint, ..32];
fn new_regs() -> ~Registers { ~([0, .. 32]) }

#[cfg(target_arch = "mips")]
fn initialize_call_frame(regs: &mut Registers, fptr: *c_void, arg: *c_void,
sp: *mut uint) {
fn initialize_call_frame(regs: &mut Registers, fptr: InitFn, arg: uint,
procedure: raw::Procedure, sp: *mut uint) {
let sp = align_down(sp);
// sp of mips o32 is 8-byte aligned
let sp = mut_offset(sp, -2);
Expand Down
18 changes: 0 additions & 18 deletions src/libgreen/coroutine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
// Coroutines represent nothing more than a context and a stack
// segment.

use std::rt::env;

use context::Context;
use stack::{StackPool, Stack};

Expand All @@ -31,22 +29,6 @@ pub struct Coroutine {
}

impl Coroutine {
pub fn new(stack_pool: &mut StackPool,
stack_size: Option<uint>,
start: proc())
-> Coroutine {
let stack_size = match stack_size {
Some(size) => size,
None => env::min_stack()
};
let mut stack = stack_pool.take_stack(stack_size);
let initial_context = Context::new(start, &mut stack);
Coroutine {
current_stack_segment: stack,
saved_context: initial_context
}
}

pub fn empty() -> Coroutine {
Coroutine {
current_stack_segment: unsafe { Stack::dummy_stack() },
Expand Down
2 changes: 1 addition & 1 deletion src/libgreen/sched.rs
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ impl Scheduler {

/// Called by a running task to end execution, after which it will
/// be recycled by the scheduler for reuse in a new task.
pub fn terminate_current_task(mut ~self, cur: ~GreenTask) {
pub fn terminate_current_task(mut ~self, cur: ~GreenTask) -> ! {
// Similar to deschedule running task and then, but cannot go through
// the task-blocking path. The task is already dying.
let stask = self.sched_task.take_unwrap();
Expand Down
34 changes: 32 additions & 2 deletions src/libgreen/stack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,9 @@ impl StackPool {

pub fn take_stack(&mut self, min_size: uint) -> Stack {
// Ideally this would be a binary search
match self.stacks.iter().position(|s| s.min_size < min_size) {
match self.stacks.iter().position(|s| min_size <= s.min_size) {
Some(idx) => self.stacks.swap_remove(idx),
None => Stack::new(min_size)
None => Stack::new(min_size)
}
}

Expand All @@ -156,3 +156,33 @@ extern {
end: *libc::uintptr_t) -> libc::c_uint;
fn rust_valgrind_stack_deregister(id: libc::c_uint);
}

// Regression tests for stack caching in `StackPool`: a stack handed back with
// `give_stack` should be returned by a later `take_stack` whenever it is large
// enough for the request.
#[cfg(test)]
mod tests {
use super::StackPool;

// A cached stack is reused for a smaller request, but not for a larger one.
#[test]
fn stack_pool_caches() {
let mut p = StackPool::new();
let s = p.take_stack(10);
p.give_stack(s);
// Asking for 4 is expected to return the cached stack, whose min_size is 10.
let s = p.take_stack(4);
assert_eq!(s.min_size, 10);
p.give_stack(s);
// Asking for 14 exceeds the cached stack, so the result must report 14.
let s = p.take_stack(14);
assert_eq!(s.min_size, 14);
p.give_stack(s);
}

// A request for exactly the cached size gets the very same stack back.
#[test]
fn stack_pool_caches_exact() {
let mut p = StackPool::new();
let mut s = p.take_stack(10);
// Overwrite valgrind_id with a sentinel so the stack can be recognised below.
s.valgrind_id = 100;
p.give_stack(s);

let s = p.take_stack(10);
assert_eq!(s.min_size, 10);
// Seeing the sentinel again shows the pool returned the stack given back above.
assert_eq!(s.valgrind_id, 100);
}
}
Loading

0 comments on commit 22c34f3

Please sign in to comment.