Skip to content

Commit 0e482f4

Browse files
committed
Implement switching between functions without using threads
1 parent dc9e0f9 commit 0e482f4

15 files changed

+438
-2
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
data/**
22
build/**
33
*.DS_Store
4-
.vscode/*
4+
.vscode
55
# ignore generated files in the main folder

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,6 @@ TODO: Write once I have committed to a design
5151
- Rework the API so that arrays of arrays are supported instead of merging all the arrays when that is the case..
5252
- support multi-file shader with macros
5353
- Gain more performance by letting the compiler know we have ensured all the alignments
54-
- Document that all fields not of slices will be COPIED to not mess with cgo ( i.e vec4[1000] is a bad idea perforamnce wise..)
54+
- Document that all fields not of slices will be COPIED to not mess with cgo ( i.e. vec4[1000] is a bad idea performance wise..)
55+
- Add pathological test cases such as wg size 0 etc. etc.
56+
- Set the NDEBUG flag to remove all those asserts..

runtime/routines/.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
a.out
2+
build.sh
3+
test*.cpp
4+
a.out.dSYM

runtime/routines/arch/amd64_nix.S

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
.text

.globl amd64_switch
#ifndef __APPLE__
.type amd64_switch, @function
#endif
.intel_syntax noprefix
/*
// amd64_switch(to [rdi], from [rsi]) — cooperative context switch for the
// System V AMD64 ABI. Saves the caller's callee-saved registers, return
// address, stack pointer and FPU/SSE control words into `from`, then
// restores the same state from `to` and jumps to its saved instruction
// pointer. Context layout (must match ARCH_set_register_state / the
// registers[] array in InvocationState):
//   +0x00..+0x28  rbx, rbp, r12, r13, r14, r15
//   +0x30         saved rip        +0x38  saved rsp
//   +0x40         x87 control word +0x44  MXCSR
// rdx and rcx are caller-saved scratch in this ABI, so clobbering them is
// safe. rdi still holds `to` at the final jmp, so the resumed function
// receives `to` as its first argument for free.
// https://en.wikipedia.org/wiki/X86_calling_conventions
// https://aaronbloomfield.github.io/pdr/book/x86-64bit-ccc-chapter.pdf
*/
amd64_switch:
/* Store the current context */
mov QWORD PTR [rsi+0x0],rbx // register assumed unm. through a call
mov QWORD PTR [rsi+0x8],rbp // register assumed unm. through a call
mov QWORD PTR [rsi+0x10],r12 // register assumed unm. through a call
mov QWORD PTR [rsi+0x18],r13 // register assumed unm. through a call
mov QWORD PTR [rsi+0x20],r14 // register assumed unm. through a call
mov QWORD PTR [rsi+0x28],r15 // register assumed unm. through a call
mov rdx,QWORD PTR [rsp] // load the return address from top of stack
mov QWORD PTR [rsi+0x30],rdx // saved rip = where the caller will resume
lea rcx,[rsp+0x8] // stack without return, lea to not change any flgs
mov QWORD PTR [rsi+0x38],rcx
fnstcw WORD PTR [rsi+0x40] // FPU control word
stmxcsr DWORD PTR [rsi+0x44] // MXCSR control and status register
/* restore the destination context */
mov rbx,QWORD PTR [rdi+0x0]
mov rbp,QWORD PTR [rdi+0x8]
mov r12,QWORD PTR [rdi+0x10]
mov r13,QWORD PTR [rdi+0x18]
mov r14,QWORD PTR [rdi+0x20]
mov r15,QWORD PTR [rdi+0x28]
fldcw WORD PTR [rdi+0x40]
ldmxcsr DWORD PTR [rdi+0x44]
mov rax,QWORD PTR [rdi+0x30] // destination rip
mov rcx,QWORD PTR [rdi+0x38] // destination rsp
mov rsp,rcx
jmp rax // resume the destination context (no call: rsp is already set up)

runtime/routines/arch/arch.hpp

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
#include <cstddef>
#include <cstdint>

#if defined(__x86_64__) && (defined(__linux__) || defined(__APPLE__))
// System V AMD64 ABI here

// Number of void*-sized slots in a saved context: 6 callee-saved GPRs,
// saved rip, saved rsp, plus one slot shared by the x87 control word
// (+0x40) and MXCSR (+0x44) — must match arch/amd64_nix.S.
#define ARCH_reg_state 10

// Switch from the context `from` to the context `to` (both point at the
// registers[] array of an InvocationState).
// the first argument to the func should also be set to to, here implicitly
// handled since we call this function with it..
void ARCH_switch(void *to, void *from) __asm__("amd64_switch");

// Prepare a register array so the next ARCH_switch into it starts
// executing `fp` on the given stack.
// `inline` is required: this function is *defined* in a header, so without
// it every translation unit that includes arch.hpp would emit its own
// external definition and violate the one-definition rule (link error).
inline void ARCH_set_register_state(void **registers, void *fp, void *stack,
                                    size_t stack_size) {
  // on x86 the stack grows downwards, so we want to find a pointer to the
  // end of the stack instead, but we want one that is aligned on a 16 byte
  // boundary
  uintptr_t u_p = (uintptr_t)(stack) + (uintptr_t)(stack_size)-1;
  u_p = u_p & (~0xF);
  // store the function pointer where it will be restored from the assembly
  // (index 6 == offset 0x30, the saved rip; index 7 == 0x38, the saved rsp)
  registers[6] = fp;
  // -24 leaves rsp % 16 == 8 at function entry, mimicking the alignment the
  // ABI guarantees right after a `call`. NOTE(review): nothing pushes a
  // return address, so the launched function must never `ret` — it has to
  // leave via ARCH_switch (see Invocation::exit).
  registers[7] = (void *)(u_p - 24);
}

#else
#error "unsupported (arch, ABI) combination encountered"
#endif

runtime/routines/atemp.hpp

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#pragma once
#include "routines.hpp"
#include <cstdio>

class TempArg;

// Throw-away stand-in for a generated shader, used by main.cpp to exercise
// the invocation/barrier machinery ("atemp" = temporary test harness).
struct shader {
  Invocation<TempArg> *thread; // invocation currently running this shader
  int id;                      // invocation id, only used in the printfs

  shader(){};

  // Prints progress markers around two barriers so the interleaving of the
  // work-group invocations is visible on stdout.
  void main() {
    printf("started %d\n", this->id);
    barrier();
    printf("barriered %d\n", this->id);
    barrier();
    printf("done %d\n", this->id);
  }

  // Defined in main.cpp; forwards to thread->barrier().
  void barrier();
};

// Argument handed to each coroutine: an id plus the shader instance to run.
// NOTE(review): the member `shader` shadows the type name `shader`, forcing
// the `struct shader` elaboration below — consider renaming the member.
class TempArg {
public:
  int no;
  shader *shader;
  TempArg() : no(0){};
  TempArg(int no, struct shader *s) : no(no), shader(s){};
};

runtime/routines/barrier.hpp

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#pragma once
#include <cassert> // was missing: this header uses assert() below and only
                   // compiled because routines.hpp happened to include
                   // <cassert> (via invocationstate.hpp) first
#include <condition_variable>
#include <mutex>

// A reusable, generation-counting barrier: `count` participants call wait()
// and all block until the last one arrives, after which the barrier re-arms
// itself for the next round.
class Barrier {
private:
  std::mutex m;
  std::condition_variable cv;
  int count;          // participants still to arrive in the current round
  int gen;            // generation number; bumped when a round completes
  int original_count; // participants per round, restored after each round

public:
  Barrier() : count(0), gen(0), original_count(0){};

  // Set the number of participants. Only valid between rounds, i.e. while
  // no thread is currently blocked in wait().
  void set_count(int c) {
    std::unique_lock<std::mutex> lc(m);
    assert(this->count == this->original_count);
    assert(c > 0); // a non-positive count could never release a waiter
    count = c;
    original_count = c;
  }

  // Block until all participants of this round have called wait().
  void wait() {
    std::unique_lock<std::mutex> lc(m);
    // capture the generation we joined; it changing is our release signal
    // (guards against spurious wakeups and lost notifications)
    int tempgen = gen;
    count--;
    if (count == 0) {
      // last arrival: open the barrier and re-arm it for the next round
      gen++;
      count = original_count;
      lc.unlock(); // notify outside the lock to avoid waking into a held mutex
      cv.notify_all();
    } else {
      cv.wait(lc, [tempgen, this] { return tempgen != this->gen; });
    }
  };
};

runtime/routines/invocation.hpp

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#pragma once

#include "routines.hpp"
#include <cassert>

// A single shader invocation running as a stackful coroutine. The embedded
// InvocationState MUST stay the first member: ARCH_switch is handed a
// pointer to the state while the entry function receives the same pointer
// as its Invocation<T>* argument — both rely on the addresses coinciding.
template <class T> class Invocation {
  InvocationState state; // MUST be first member
  bool finished;         // set by exit(); guards re-use in set_function()
  T arg;                 // value handed to the entry function
  WorkThread<T> *wt;     // worker currently driving this invocation

private:
  // Only WorkGroup (a friend) creates invocations, via create_thread().
  // FIX: `wt` used to be left uninitialized, so the assert in barrier()
  // read an indeterminate pointer (UB) instead of reliably catching a
  // barrier() on an invocation no worker has picked up yet.
  Invocation(int stack_size)
      : state(InvocationState(stack_size)), finished(true), wt(nullptr){};

public:
  // exit this thread and signal that it is done, a thread function
  // must never return without calling exit.
  void exit() {
    this->finished = true;
    this->barrier(); // switches back to the worker; never returns
  };

  // re-configure the thread to run the given function with the given arg
  void set_function(void (*fp)(Invocation<T> *), T arg) {
    assert(this->finished); // must not retarget a still-running invocation
    this->finished = false;
    this->arg = arg;
    ARCH_set_register_state(this->state.registers, (void *)fp,
                            this->state.stack, this->state.stack_size);
  }

  // Returns (a copy of) the argument set by set_function.
  T get_argument() { return this->arg; };

  // Yield control back to the driving worker thread (shader-level barrier).
  void barrier() {
    assert(this->wt != nullptr); // a worker must have adopted us first
    ARCH_switch(&this->wt->state, &this->state);
  };

  friend class WorkThread<T>;
  friend class WorkGroup<T>;
};

runtime/routines/invocationstate.hpp

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#pragma once
#include "routines.hpp"
#include <cstdint>
#include <cstdlib>
#include <cassert>

// Saved execution context of one invocation: the register snapshot used by
// ARCH_switch plus the heap-allocated stack the invocation runs on.
class InvocationState {
public:
  void *registers[ARCH_reg_state]; // layout defined by arch/amd64_nix.S
  void *stack;                     // owned; base of the malloc'd stack
  size_t stack_size;               // size of `stack` in bytes

public:
  InvocationState(int stack_size) {
    assert(stack_size > 0);

    // zero out the register state
    for (int i = 0; i < ARCH_reg_state; i++) {
      this->registers[i] = 0;
    }

    this->stack_size = stack_size;
    this->stack = malloc(stack_size);
    assert(this->stack != nullptr); // TODO: handle allocation failure gracefully
    // TODO(vron): use guard page at end of stack to catch stack overflows
  }

  // Movable but not copyable: `stack` is a raw owning pointer, so the
  // implicitly-generated copy constructor would lead to a double free in
  // the destructor. The move constructor also keeps the pre-C++17
  // `state(InvocationState(n))` initialization in Invocation valid.
  InvocationState(InvocationState &&o) noexcept
      : stack(o.stack), stack_size(o.stack_size) {
    for (int i = 0; i < ARCH_reg_state; i++) {
      this->registers[i] = o.registers[i];
    }
    o.stack = nullptr; // leave the source safely destructible
    o.stack_size = 0;
  }
  InvocationState(const InvocationState &) = delete;
  InvocationState &operator=(const InvocationState &) = delete;
  InvocationState &operator=(InvocationState &&) = delete;

  ~InvocationState() {
    if (!this->stack)
      return;
    free(this->stack);
  };
};

runtime/routines/main.cpp

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#include "atemp.hpp"
2+
#include "routines.hpp"
3+
#include <cstdio>
4+
5+
#define WG_SIZE 5
6+
#define NO_DI 3
7+
8+
// Entry point executed on an invocation's private stack: unpack the
// argument, let the shader know which invocation drives it, run the shader
// body, then hand control back for good via exit().
void coroutine(Invocation<TempArg> *th) {
  TempArg task = th->get_argument();
  shader *sh = task.shader;
  sh->thread = th;
  sh->main();
  th->exit(); // never returns
}
14+
15+
int main() {
16+
auto wg = new WorkGroup<TempArg>(2, 1024 * 1024);
17+
18+
Invocation<TempArg> *threads[WG_SIZE];
19+
shader shaders[WG_SIZE];
20+
for (int i = 0; i < WG_SIZE; i++) {
21+
threads[i] = wg->create_thread();
22+
}
23+
24+
// for each wg dispatch:
25+
for (int n = 0; n < NO_DI; n++) {
26+
for (int i = 0; i < WG_SIZE; i++) {
27+
// do set invocation id's etc
28+
shaders[i].thread = threads[i];
29+
shaders[i].id = i + (n + 1) * 100;
30+
threads[i]->set_function(&coroutine,
31+
TempArg(i + (n + 1) * 100, &shaders[i]));
32+
}
33+
34+
wg->run(WG_SIZE, threads);
35+
}
36+
delete wg;
37+
return 0;
38+
}
39+
40+
// Forward the shader-level barrier to the invocation currently running it
// (declared in atemp.hpp; defined here because Invocation is complete now).
void shader::barrier() { this->thread->barrier(); }

runtime/routines/routines.hpp

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#pragma once

// Umbrella header for the coroutine runtime. Forward declarations first so
// the headers below may reference each other's templates.
template <class T> class Invocation;
template <class T> class WorkGroup;
template <class T> class WorkThread;

// NOTE: include order is significant — arch.hpp must precede
// invocationstate.hpp (ARCH_reg_state), which must precede invocation.hpp,
// and so on down the dependency chain.
#include "../debug.hpp"
#include "arch/arch.hpp"
#include "invocationstate.hpp"
#include "barrier.hpp"
#include "invocation.hpp"
#include "workpiece.hpp"
#include "workqueue.hpp"
#include "workthread.hpp"
#include "workgroup.hpp"

runtime/routines/workgroup.hpp

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#pragma once

// TODO: investigate false sharing and thread locations (i thin we should use 64
// byte separation all over the place...)

#include "routines.hpp"
#include <condition_variable>
#include <mutex>
#include <thread>

// Owns a pool of OS worker threads (WorkThread) and schedules batches of
// prepared Invocations onto them through the shared WorkQueue `sync`.
template <class T> class WorkGroup {
public:
  WorkQueue<T> sync;       // work distribution + completion signalling
  int num_thread;          // number of OS worker threads
  WorkThread<T> **workers; // owned array of owned workers
  int stack_size;          // stack size handed to each created Invocation

public:
  // Spawn num_thread workers, each pulling work from `sync`.
  // NOTE(review): the malloc result is not checked — confirm debug.hpp
  // or a later change handles allocation failure.
  WorkGroup(int num_thread, int stack_size)
      : num_thread(num_thread), stack_size(stack_size) {
    assert(num_thread > 0);
    this->workers =
        (WorkThread<T> **)malloc(num_thread * sizeof(WorkThread<T> *));
    for (int i = 0; i < num_thread; i++) {
      this->workers[i] = new WorkThread<T>(&this->sync);
    }
  }

  // Create a new invocation; ownership passes to the caller — the
  // WorkGroup destructor does NOT free these.
  Invocation<T> *create_thread() { return (new Invocation<T>(stack_size)); };

  // Run `no` prepared invocations to completion; blocks until all workers
  // report done via sync.wait_for_done().
  void run(int no, Invocation<T> **threads) {
    // simply split the work equally and split across threads. Downside of this
    // approach is that one thread might finish before. Upside is that we will largely
    // avoid false sharing and likely improve cache usage since subsequent invocations will
    // likely access subsequent array elements etc. Consider these two effects when / if
    // the scheduling is redone to something smarter.
    int nt = no < this->num_thread ? no : num_thread; // workers actually used
    int si = 0;                                       // start index of next piece
    int e_each = (no + (nt - 1)) / nt;                // ceil(no / nt) per piece
    sync.barrier.set_count(nt);
    for (int ti = 0; ti < nt; ti++) {
      int e = e_each;
      if (e + si > no) {
        e = no - si;
      }
      // NOTE(review): when no doesn't divide evenly (e.g. no=6, nt=4) the
      // tail piece(s) can have e == 0 with a one-past-end pointer — confirm
      // WorkThread treats a zero-count piece as "just reach the barrier"
      // and never dereferences `threads` in that case.
      sync.send_work(WorkPiece<T>(e, &threads[si]));
      si += e;
    }

    for (int ti = 0; ti < nt; ti++) {
      sync.wait_for_done();
    }
  };

  ~WorkGroup() {
    // signal threads and wait for them to quit
    // (a null `threads` pointer is the agreed end-of-computation marker)
    for (int i = 0; i < this->num_thread; i++) {
      this->sync.send_work(WorkPiece<T>(0, nullptr));
    }
    for (int i = 0; i < this->num_thread; i++) {
      delete workers[i]; // presumably joins the OS thread — verify in WorkThread
    }
    free(this->workers);
  }
};

runtime/routines/workpiece.hpp

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#pragma once

#include "routines.hpp"

// A unit of work handed to a WorkThread: `no` consecutive invocations
// starting at `threads`. A null `threads` pointer is the shutdown signal.
template <class T> class WorkPiece {
public:
  int no;                  // number of invocations in this piece
  Invocation<T> **threads; // nullptr => end of computation
  // FIX: default-construct as an explicit end-of-computation marker.
  // Previously `threads` was left indeterminate, so reading it was UB and
  // a default-constructed piece could be mistaken for real work.
  WorkPiece() : no(0), threads(nullptr){};
  WorkPiece(int no, Invocation<T> **threads) : no(no), threads(threads){};
};

0 commit comments

Comments
 (0)