Skip to content

Commit 0e482f4

Browse files
committed
Implement switching between functions without using threads
1 parent dc9e0f9 commit 0e482f4

15 files changed

+438
-2
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
data/**
22
build/**
33
*.DS_Store
4-
.vscode/*
4+
.vscode
55
# ignore generated files in the main folder

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,6 @@ TODO: Write once I have committed to a design
5151
- Rework the API so that arrays of arrays are supported instead of merging all the arrays when that is the case..
5252
- support multi-file shader with macros
5353
- Gain more performance by letting the compiler know we have ensured all the alignments
54-
- Document that all fields not of slices will be COPIED to not mess with cgo ( i.e vec4[1000] is a bad idea perforamnce wise..)
54+
- Document that all fields not of slices will be COPIED to not mess with cgo ( i.e. vec4[1000] is a bad idea performance wise..)
55+
- Add pathological test cases such as wg size 0 etc. etc.
56+
- Set the NDEBUG flag to remove all those asserts..

runtime/routines/.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
a.out
2+
build.sh
3+
test*.cpp
4+
a.out.dSYM

runtime/routines/arch/amd64_nix.S

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
.text

.globl amd64_switch
#ifndef __APPLE__
.type amd64_switch, @function
#endif
.intel_syntax noprefix
/*
// amd64_switch(to [rdi], from [rsi]) — cooperative context switch for the
// System V AMD64 ABI. Saves the caller's callee-saved registers, return
// address, stack pointer and FPU/SSE control words into `from`, then
// restores the same state from `to` and jumps to its saved instruction
// pointer. Context layout (must match ARCH_set_register_state / the
// registers[] array in InvocationState):
//   +0x00..+0x28  rbx, rbp, r12, r13, r14, r15
//   +0x30         saved rip        +0x38  saved rsp
//   +0x40         x87 control word +0x44  MXCSR
// rdx and rcx are caller-saved scratch in this ABI, so clobbering them is
// safe. rdi still holds `to` at the final jmp, so the resumed function
// receives `to` as its first argument for free.
// https://en.wikipedia.org/wiki/X86_calling_conventions
// https://aaronbloomfield.github.io/pdr/book/x86-64bit-ccc-chapter.pdf
*/
amd64_switch:
/* Store the current context */
mov QWORD PTR [rsi+0x0],rbx // register assumed unm. through a call
mov QWORD PTR [rsi+0x8],rbp // register assumed unm. through a call
mov QWORD PTR [rsi+0x10],r12 // register assumed unm. through a call
mov QWORD PTR [rsi+0x18],r13 // register assumed unm. through a call
mov QWORD PTR [rsi+0x20],r14 // register assumed unm. through a call
mov QWORD PTR [rsi+0x28],r15 // register assumed unm. through a call
mov rdx,QWORD PTR [rsp] // load the return address from top of stack
mov QWORD PTR [rsi+0x30],rdx // saved rip = where the caller will resume
lea rcx,[rsp+0x8] // stack without return, lea to not change any flgs
mov QWORD PTR [rsi+0x38],rcx
fnstcw WORD PTR [rsi+0x40] // FPU control word
stmxcsr DWORD PTR [rsi+0x44] // MXCSR control and status register
/* restore the destination context */
mov rbx,QWORD PTR [rdi+0x0]
mov rbp,QWORD PTR [rdi+0x8]
mov r12,QWORD PTR [rdi+0x10]
mov r13,QWORD PTR [rdi+0x18]
mov r14,QWORD PTR [rdi+0x20]
mov r15,QWORD PTR [rdi+0x28]
fldcw WORD PTR [rdi+0x40]
ldmxcsr DWORD PTR [rdi+0x44]
mov rax,QWORD PTR [rdi+0x30] // destination rip
mov rcx,QWORD PTR [rdi+0x38] // destination rsp
mov rsp,rcx
jmp rax // resume the destination context (no call: rsp is already set up)

runtime/routines/arch/arch.hpp

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
#include <cstddef>
#include <cstdint>

#if defined(__x86_64__) && (defined(__linux__) || defined(__APPLE__))
// System V AMD64 ABI here

// Number of void*-sized slots in a saved context: 6 callee-saved GPRs,
// saved rip, saved rsp, plus one slot shared by the x87 control word
// (+0x40) and MXCSR (+0x44) — must match arch/amd64_nix.S.
#define ARCH_reg_state 10

// Switch from the context `from` to the context `to` (both point at the
// registers[] array of an InvocationState).
// the first argument to the func should also be set to to, here implicitly
// handled since we call this function with it..
void ARCH_switch(void *to, void *from) __asm__("amd64_switch");

// Prepare a register array so the next ARCH_switch into it starts
// executing `fp` on the given stack.
// `inline` is required: this function is *defined* in a header, so without
// it every translation unit that includes arch.hpp would emit its own
// external definition and violate the one-definition rule (link error).
inline void ARCH_set_register_state(void **registers, void *fp, void *stack,
                                    size_t stack_size) {
  // on x86 the stack grows downwards, so we want to find a pointer to the
  // end of the stack instead, but we want one that is aligned on a 16 byte
  // boundary
  uintptr_t u_p = (uintptr_t)(stack) + (uintptr_t)(stack_size)-1;
  u_p = u_p & (~0xF);
  // store the function pointer where it will be restored from the assembly
  // (index 6 == offset 0x30, the saved rip; index 7 == 0x38, the saved rsp)
  registers[6] = fp;
  // -24 leaves rsp % 16 == 8 at function entry, mimicking the alignment the
  // ABI guarantees right after a `call`. NOTE(review): nothing pushes a
  // return address, so the launched function must never `ret` — it has to
  // leave via ARCH_switch (see Invocation::exit).
  registers[7] = (void *)(u_p - 24);
}

#else
#error "unsupported (arch, ABI) combination encountered"
#endif

runtime/routines/atemp.hpp

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#pragma once
#include "routines.hpp"
#include <cstdio>

class TempArg;

// Throw-away stand-in for a generated shader, used by main.cpp to exercise
// the invocation/barrier machinery ("atemp" = temporary test harness).
struct shader {
  Invocation<TempArg> *thread; // invocation currently running this shader
  int id;                      // invocation id, only used in the printfs

  shader(){};

  // Prints progress markers around two barriers so the interleaving of the
  // work-group invocations is visible on stdout.
  void main() {
    printf("started %d\n", this->id);
    barrier();
    printf("barriered %d\n", this->id);
    barrier();
    printf("done %d\n", this->id);
  }

  // Defined in main.cpp; forwards to thread->barrier().
  void barrier();
};

// Argument handed to each coroutine: an id plus the shader instance to run.
// NOTE(review): the member `shader` shadows the type name `shader`, forcing
// the `struct shader` elaboration below — consider renaming the member.
class TempArg {
public:
  int no;
  shader *shader;
  TempArg() : no(0){};
  TempArg(int no, struct shader *s) : no(no), shader(s){};
};

runtime/routines/barrier.hpp

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#pragma once
#include <cassert> // was missing: this header uses assert() below and only
                   // compiled because routines.hpp happened to include
                   // <cassert> (via invocationstate.hpp) first
#include <condition_variable>
#include <mutex>

// A reusable, generation-counting barrier: `count` participants call wait()
// and all block until the last one arrives, after which the barrier re-arms
// itself for the next round.
class Barrier {
private:
  std::mutex m;
  std::condition_variable cv;
  int count;          // participants still to arrive in the current round
  int gen;            // generation number; bumped when a round completes
  int original_count; // participants per round, restored after each round

public:
  Barrier() : count(0), gen(0), original_count(0){};

  // Set the number of participants. Only valid between rounds, i.e. while
  // no thread is currently blocked in wait().
  void set_count(int c) {
    std::unique_lock<std::mutex> lc(m);
    assert(this->count == this->original_count);
    assert(c > 0); // a non-positive count could never release a waiter
    count = c;
    original_count = c;
  }

  // Block until all participants of this round have called wait().
  void wait() {
    std::unique_lock<std::mutex> lc(m);
    // capture the generation we joined; it changing is our release signal
    // (guards against spurious wakeups and lost notifications)
    int tempgen = gen;
    count--;
    if (count == 0) {
      // last arrival: open the barrier and re-arm it for the next round
      gen++;
      count = original_count;
      lc.unlock(); // notify outside the lock to avoid waking into a held mutex
      cv.notify_all();
    } else {
      cv.wait(lc, [tempgen, this] { return tempgen != this->gen; });
    }
  };
};

runtime/routines/invocation.hpp

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#pragma once

#include "routines.hpp"
#include <cassert>

// A single shader invocation running as a stackful coroutine. The embedded
// InvocationState MUST stay the first member: ARCH_switch is handed a
// pointer to the state while the entry function receives the same pointer
// as its Invocation<T>* argument — both rely on the addresses coinciding.
template <class T> class Invocation {
  InvocationState state; // MUST be first member
  bool finished;         // set by exit(); guards re-use in set_function()
  T arg;                 // value handed to the entry function
  WorkThread<T> *wt;     // worker currently driving this invocation

private:
  // Only WorkGroup (a friend) creates invocations, via create_thread().
  // FIX: `wt` used to be left uninitialized, so the assert in barrier()
  // read an indeterminate pointer (UB) instead of reliably catching a
  // barrier() on an invocation no worker has picked up yet.
  Invocation(int stack_size)
      : state(InvocationState(stack_size)), finished(true), wt(nullptr){};

public:
  // exit this thread and signal that it is done, a thread function
  // must never return without calling exit.
  void exit() {
    this->finished = true;
    this->barrier(); // switches back to the worker; never returns
  };

  // re-configure the thread to run the given function with the given arg
  void set_function(void (*fp)(Invocation<T> *), T arg) {
    assert(this->finished); // must not retarget a still-running invocation
    this->finished = false;
    this->arg = arg;
    ARCH_set_register_state(this->state.registers, (void *)fp,
                            this->state.stack, this->state.stack_size);
  }

  // Returns (a copy of) the argument set by set_function.
  T get_argument() { return this->arg; };

  // Yield control back to the driving worker thread (shader-level barrier).
  void barrier() {
    assert(this->wt != nullptr); // a worker must have adopted us first
    ARCH_switch(&this->wt->state, &this->state);
  };

  friend class WorkThread<T>;
  friend class WorkGroup<T>;
};

runtime/routines/invocationstate.hpp

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#pragma once
#include "routines.hpp"
#include <cstdint>
#include <cstdlib>
#include <cassert>

// Saved execution context of one invocation: the register snapshot used by
// ARCH_switch plus the heap-allocated stack the invocation runs on.
class InvocationState {
public:
  void *registers[ARCH_reg_state]; // layout defined by arch/amd64_nix.S
  void *stack;                     // owned; base of the malloc'd stack
  size_t stack_size;               // size of `stack` in bytes

public:
  InvocationState(int stack_size) {
    assert(stack_size > 0);

    // zero out the register state
    for (int i = 0; i < ARCH_reg_state; i++) {
      this->registers[i] = 0;
    }

    this->stack_size = stack_size;
    this->stack = malloc(stack_size);
    assert(this->stack != nullptr); // TODO: handle allocation failure gracefully
    // TODO(vron): use guard page at end of stack to catch stack overflows
  }

  // Movable but not copyable: `stack` is a raw owning pointer, so the
  // implicitly-generated copy constructor would lead to a double free in
  // the destructor. The move constructor also keeps the pre-C++17
  // `state(InvocationState(n))` initialization in Invocation valid.
  InvocationState(InvocationState &&o) noexcept
      : stack(o.stack), stack_size(o.stack_size) {
    for (int i = 0; i < ARCH_reg_state; i++) {
      this->registers[i] = o.registers[i];
    }
    o.stack = nullptr; // leave the source safely destructible
    o.stack_size = 0;
  }
  InvocationState(const InvocationState &) = delete;
  InvocationState &operator=(const InvocationState &) = delete;
  InvocationState &operator=(InvocationState &&) = delete;

  ~InvocationState() {
    if (!this->stack)
      return;
    free(this->stack);
  };
};

runtime/routines/main.cpp

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#include "atemp.hpp"
2+
#include "routines.hpp"
3+
#include <cstdio>
4+
5+
#define WG_SIZE 5
6+
#define NO_DI 3
7+
8+
// Entry point executed on an invocation's private stack: unpack the
// argument, let the shader know which invocation drives it, run the shader
// body, then hand control back for good via exit().
void coroutine(Invocation<TempArg> *th) {
  TempArg task = th->get_argument();
  shader *sh = task.shader;
  sh->thread = th;
  sh->main();
  th->exit(); // never returns
}
14+
15+
int main() {
16+
auto wg = new WorkGroup<TempArg>(2, 1024 * 1024);
17+
18+
Invocation<TempArg> *threads[WG_SIZE];
19+
shader shaders[WG_SIZE];
20+
for (int i = 0; i < WG_SIZE; i++) {
21+
threads[i] = wg->create_thread();
22+
}
23+
24+
// for each wg dispatch:
25+
for (int n = 0; n < NO_DI; n++) {
26+
for (int i = 0; i < WG_SIZE; i++) {
27+
// do set invocation id's etc
28+
shaders[i].thread = threads[i];
29+
shaders[i].id = i + (n + 1) * 100;
30+
threads[i]->set_function(&coroutine,
31+
TempArg(i + (n + 1) * 100, &shaders[i]));
32+
}
33+
34+
wg->run(WG_SIZE, threads);
35+
}
36+
delete wg;
37+
return 0;
38+
}
39+
40+
// Forward the shader-level barrier to the invocation currently running it
// (declared in atemp.hpp; defined here because Invocation is complete now).
void shader::barrier() { this->thread->barrier(); }

runtime/routines/routines.hpp

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once

// Umbrella header for the coroutine runtime. Forward declarations first so
// the headers below may reference each other's templates.
template <class T> class Invocation;
template <class T> class WorkGroup;
template <class T> class WorkThread;

// NOTE: include order is significant — arch.hpp must precede
// invocationstate.hpp (ARCH_reg_state), which must precede invocation.hpp,
// and so on down the dependency chain.
#include "../debug.hpp"
#include "arch/arch.hpp"
#include "invocationstate.hpp"
#include "barrier.hpp"
#include "invocation.hpp"
#include "workpiece.hpp"
#include "workqueue.hpp"
#include "workthread.hpp"
#include "workgroup.hpp"

runtime/routines/workgroup.hpp

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#pragma once

// TODO: investigate false sharing and thread locations (i thin we should use 64
// byte separation all over the place...)

#include "routines.hpp"
#include <condition_variable>
#include <mutex>
#include <thread>

// Owns a pool of OS worker threads (WorkThread) and schedules batches of
// prepared Invocations onto them through the shared WorkQueue `sync`.
template <class T> class WorkGroup {
public:
  WorkQueue<T> sync;       // work distribution + completion signalling
  int num_thread;          // number of OS worker threads
  WorkThread<T> **workers; // owned array of owned workers
  int stack_size;          // stack size handed to each created Invocation

public:
  // Spawn num_thread workers, each pulling work from `sync`.
  // NOTE(review): the malloc result is not checked — confirm debug.hpp
  // or a later change handles allocation failure.
  WorkGroup(int num_thread, int stack_size)
      : num_thread(num_thread), stack_size(stack_size) {
    assert(num_thread > 0);
    this->workers =
        (WorkThread<T> **)malloc(num_thread * sizeof(WorkThread<T> *));
    for (int i = 0; i < num_thread; i++) {
      this->workers[i] = new WorkThread<T>(&this->sync);
    }
  }

  // Create a new invocation; ownership passes to the caller — the
  // WorkGroup destructor does NOT free these.
  Invocation<T> *create_thread() { return (new Invocation<T>(stack_size)); };

  // Run `no` prepared invocations to completion; blocks until all workers
  // report done via sync.wait_for_done().
  void run(int no, Invocation<T> **threads) {
    // simply split the work equally and split across threads. Downside of this
    // approach is that one thread might finish before. Upside is that we will largely
    // avoid false sharing and likely improve cache usage since subsequent invocations will
    // likely access subsequent array elements etc. Consider these two effects when / if
    // the scheduling is redone to something smarter.
    int nt = no < this->num_thread ? no : num_thread; // workers actually used
    int si = 0;                                       // start index of next piece
    int e_each = (no + (nt - 1)) / nt;                // ceil(no / nt) per piece
    sync.barrier.set_count(nt);
    for (int ti = 0; ti < nt; ti++) {
      int e = e_each;
      if (e + si > no) {
        e = no - si;
      }
      // NOTE(review): when no doesn't divide evenly (e.g. no=6, nt=4) the
      // tail piece(s) can have e == 0 with a one-past-end pointer — confirm
      // WorkThread treats a zero-count piece as "just reach the barrier"
      // and never dereferences `threads` in that case.
      sync.send_work(WorkPiece<T>(e, &threads[si]));
      si += e;
    }

    for (int ti = 0; ti < nt; ti++) {
      sync.wait_for_done();
    }
  };

  ~WorkGroup() {
    // signal threads and wait for them to quit
    // (a null `threads` pointer is the agreed end-of-computation marker)
    for (int i = 0; i < this->num_thread; i++) {
      this->sync.send_work(WorkPiece<T>(0, nullptr));
    }
    for (int i = 0; i < this->num_thread; i++) {
      delete workers[i]; // presumably joins the OS thread — verify in WorkThread
    }
    free(this->workers);
  }
};

runtime/routines/workpiece.hpp

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#pragma once

#include "routines.hpp"

// A unit of work handed to a WorkThread: `no` consecutive invocations
// starting at `threads`. A null `threads` pointer is the shutdown signal.
template <class T> class WorkPiece {
public:
  int no;                  // number of invocations in this piece
  Invocation<T> **threads; // nullptr => end of computation
  // FIX: default-construct as an explicit end-of-computation marker.
  // Previously `threads` was left indeterminate, so reading it was UB and
  // a default-constructed piece could be mistaken for real work.
  WorkPiece() : no(0), threads(nullptr){};
  WorkPiece(int no, Invocation<T> **threads) : no(no), threads(threads){};
};

0 commit comments

Comments
 (0)