-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcoroutines.h
294 lines (255 loc) · 12.4 KB
/
coroutines.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
/*
* coroutines.h
*
* See https://medium.com/software-development-2/coroutines-and-fibers-why-and-when-5798f08464fd#.hsfh18d5f
*
* The libc get/make/swap-context() API is somewhat more powerful, for makecontext() can setup as trampoline a function
* that accepts arbitrary number of arguments. The downside is complexity.
* Originally, this library provided alternative implementations to those APIs. It was only storing and restoring the regs ABI stipulates we should track, doing away with
* signals mask and floating points regs. It was working fine.
*
* I don't really need that - I only need is a single argument, though it should be pretty straight forward to support arbitrary number of arguments
* like makecontext() does, but the overhead and complexity required to implement that do not justify pursuing this change.
* See https://github.com/lattera/glibc /sysdeps/unix/sysv/linux/x86_64/ for glibc implementation specifics.
*
* Instead, what makes more sense and is infinitely simpler is to use a static thread_local for the trampoline's context.
*
* We could also support more platforms/arcs, and other means of control transfer.
* As explained by https://github.com/clibs/coro/coro.h, other approaches include:
* - use of SUSv2's get/set/swap/makecontext API. Supported by some systems, and slow
* - use of SUSv2's setjmp/longjmp and sigaltstack APIs to specify a stack for a signal handler's stack. Much slower to create coro than ucontext APIs, but somewhat faster to switch between them.
* We can and should use the alternative _setjmp/_longjmp instead (will not incur the system call to save and restore the siganls mask) - which results in much higher performance.
* This is supported by all Unix flavors
* - variant for glibc <= 2.1, for Linux, which is very fast, but requires older glibc. This comes down to use of a different use of sigsetjmp() and siglongjmp() APIs
* - use of the PThreads API. This is very slow and better be avoided
* If you need to support multiple archs, use the excellent https://github.com/clibs/coro
*
* I suppose I could just provide support to other unix flavors via getcontext()/makecontext()/setcontext() for initialization and setjmp()/longjmp() for switches which is faster
* until assembly optimized implementations to coro_transfer() and coro_resume() can be implemented. (see SeaStar thread_context design)
*
*
* PERFORMANCE
* On a Xeon at 2Ghz, we get about 21million context switches/second. This is far more than we would need, assuming appropriate use of coroutines/fibers and context switching
* where it makes sense(e.g before blocking ops, or when a coro has been running for too long), which means there will be an infrequent need for context switching, it should
* be the 'solution' to achieving very high performance.
*
*/
#pragma once
#if !defined(__amd64) || !defined(__linux)
// TODO: support i386-linux(trivial). Support for other unix flavors via makecontext(), setjmp() and longjmp(). (that's what Seastar does it)
#error Unsupported platform/arch
#endif
#define VALGRIND_SUPPORT 1
#include <switch.h>
#ifdef VALGRIND_SUPPORT
#include <valgrind/valgrind.h>
#endif
#include <switch_bitops.h>
#include <switch_ll.h>
#include <switch_mallocators.h>
#include <switch_vector.h>
#pragma mark Coroutines
struct coro_stack
{
void *stack;
size_t size;
uint8_t flags{0};
#ifdef VALGRIND_SUPPORT
int valgrindId;
#endif
bool alloc(size_t hint, const uint8_t protect = false);
void release();
bool set(void *const p, const size_t s);
};
struct coro_ctx
{
void **sp; // must be offset 0 in coro_ctx; see coro_transfer and coro_resume impl.
};
// We absolutely need to make sure the compiler won't inline those (e.g -O2 or higher)
extern "C" {
extern void __attribute__((__noinline__, __regparm__(2)))
coro_transfer(coro_ctx *prev /* rdi */, coro_ctx *next /* rsi */);
extern void __attribute__((__noinline__, __regparm__(1)))
coro_resume(coro_ctx *ctx /* rdi */);
};
// http://wiki.osdev.org/System_V_ABI
// functions are called using the CALL instruction, which pushes the address of the next instruction to the stack, and then jumps to the operand
// functions return to the caller using the RET instruction, which pops a value from the stack and jumps to it.
//
// The stack is 16-byte aligned just before the call instruction is called
//
// Functions preserve rbx, rsp, rbp, r12, r13, r14 and r15; while rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 are scratch regs.
// The return value is stored in the rax reg, or if it is a 128-bit value, then the higher 64-bits go in rdx
//
// Optionally, functions push rbp such that the caller-return-rip is 8 bytes above it, and erp to the addres of the saved rbp. this allows iterating the existing stack frames.
void coro_init_with_trampoline(coro_ctx *ctx, void (*func)(void *), void *arg, coro_stack *const coroStack, void (*trampoline)());
void coro_init(coro_ctx *ctx, void (*func)(void *), void *arg, coro_stack *const coroStack);
#pragma mark Fibers
// We need both different prios and time tracking, but let's make this tunable here
#define HAVE_FIBER_PRIOS 1 // Support different priorities
#define HAVE_FIBER_TS 1 // track coroutines timestamps(running since, ended at)
namespace Fibers
{
void FiberTrampoline();
struct fiber
{
coro_ctx ctx;
coro_stack stack;
#ifdef HAVE_FIBER_PRIOS
// fibers of priority X will be scheduled before those of priority > X
// 0 is the highest prio, 7 is the lowest
uint8_t prio;
#endif
#ifdef HAVE_FIBER_TS
uint64_t ts; // if currently running, when it was scheduled, otherwise, when it stopped running
#endif
switch_dlist list; // in ready list for its priority
fiber *awaits; // if set, another fiber is waiting for this fiber to complete
std::function<void()> L;
};
// this is go's scheduler implementation
// https://github.com/golang/go/blob/master/src/runtime/proc.go
// findrunnable() is very interesting. Will GC if needed.
// It first checks the local runq. If it is empty, it then checks the global run queue.
// It then check netpoll, and then randomly steals from other Ps (see M, N, P)
struct Scheduler
{
public:
fiber *cur{nullptr};
private:
#ifdef HAVE_FIBER_PRIOS
uint8_t readyMask{0};
#endif
public:
uint32_t readyCnt{0};
// For performance measurements
uint32_t ctxSwitches{0};
private:
#ifdef HAVE_FIBER_PRIOS
switch_dlist ready[8];
#else
switch_dlist ready;
#endif
public:
uint32_t aliveCnt{0}; // includes suspended and not runnable count
simple_allocator allocator;
Switch::vector<fiber *> freeFibers;
Switch::vector<coro_stack> freeStacks;
private:
fiber *get_fiber();
void put_fiber(fiber *const f);
// Makes fiber `f` runnable. That is, it will be eventually schedulef by trasnfering execution control to it
void make_ready(fiber *const f);
// Here you could roll your own which would e.g also consider
// fibers that haven't had a chance to run for too long, or always schedule a particular fiber regardless
// of priorities first, etc. Most likely should be made virtual so
// that other scheduler implementations can do what's needed
fiber *next_ready();
public:
inline auto current() const noexcept
{
return cur;
}
Scheduler()
{
#ifdef HAVE_FIBER_PRIOS
for (auto &it : ready)
switch_dlist_init(&it);
#else
switch_dlist_init(&ready);
#endif
}
~Scheduler();
template <class F>
void schedule(F &&l, const uint8_t prio = 4)
{
auto *const f = get_fiber();
Drequire(f);
f->L = std::move(l);
// We are using our own trampoline, just so that we we 'll save one un-necessary call
// from the standard coroInit trampoline to another. We also don't need to provide a func.ptr
coro_init_with_trampoline(&f->ctx, nullptr, f, &f->stack, FiberTrampoline);
f->awaits = nullptr;
#ifdef HAVE_FIBER_PRIOS
f->prio = prio;
#endif
make_ready(f);
}
// Puts current fiber to sleep.
// Something will need to wake it up later. Typically, you 'd obtain curent active fiber (e.g scheduler.Cur() call)
// and have something later invoke scheduler.Resume() on it, so that it can be rescheduled.
//
// For example, if you are about to perform some expensive I/O op, you may want to do it off thread and then resume when
// you are done, e.g
// {
// auto cur = scheduler.current();
//
// WorkOffThread(cur);
// suspend();
// }
void suspend();
void resume(fiber *);
// Yield control to another runnable fiber, but also make this current fiber runnable
// so that it will get a chance to run again as soon as possible based on the scheduler semantics.
void yield();
// Convenience method. Schedule a fiber and immediately yield
template <class F>
void yield(F &&f, const uint8_t prio = 4)
{
schedule(std::move(f), prio);
yield();
}
// This should be invoked in the execution context of a 'scheduler', that is, not in the context
// of a fiber (though a scheduler can of course be implemented as a fiber).
// Control will be transfered to the next runnable fiber.
void yield_from_main(coro_ctx *const ctx);
void yield_from_main();
// Make `f` ready, and schedule the next runnable
// It will suspend the current fiber and will then schedule the next runnable. Once
// the scheduled `f` completes, the now suspended fiber will be made runnable again and potentially immediately rescheduled
//
// See "An Introduction to C++ Coroutines by James McNellis" https://youtu.be/YYtzQ355_Co?t=770
// where he is demonstrating that
// {
// auto result = co_await expression;
// }
// is translated to
// {
// auto &&__a = expression;
//
// if (!__a.await_ready())
// {
// __a.await_suspend(coroutine-handle);
// // .. suspend/resume points ..
// }
// auto result = __a.await_resume();
// }
//
//
// Conversely, co_return y is translated to ( see https://youtu.be/YYtzQ355_Co?t=2727 )
// {
// __promise.return_value(x);
// goto __final_suspend_label;
// }
// and co_yield z to
// {
// co_await __promise.yield_value(z);
// }
void await_fiber(fiber *const f);
template <class T>
void await(T &&l, const uint8_t prio = 4)
{
auto *const f = get_fiber();
#ifdef HAVE_FIBER_PRIOS
f->prio = prio;
#endif
f->L = std::move(l);
await_fiber(f);
}
// Whenever a fiber function returns(i.e fiber has exited)
// this is invoked.
void on_fiber_exit();
};
extern thread_local coro_ctx mainCoro;
extern thread_local Scheduler scheduler;
}