Skip to content

Commit 48d84c1

Browse files
committed
[OPTIM] linux: add support for bypassing libc to force using vsyscalls
Some distros' libcs are built for CPUs earlier than i686 and as such do not offer support for the Linux kernel's faster vsyscalls. This code adds a new build option USE_VSYSCALL to bypass libc for the most commonly used system calls. A net gain of about 10% can be observed with this change alone. It only works when /proc/sys/abi/vsyscall32 equals exactly 2. When it's set to 1, the VDSO is randomized and cannot be used.
1 parent b695a6e commit 48d84c1

File tree

3 files changed

+216
-1
lines changed

3 files changed

+216
-1
lines changed

Makefile

+7
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
# USE_LINUX_SPLICE : enable kernel 2.6 splicing (broken on old kernels)
2323
# USE_LIBCRYPT : enable crypted passwords using -lcrypt
2424
# USE_CRYPT_H : set it if your system requires including crypt.h
25+
# USE_VSYSCALL : enable vsyscall on Linux x86, bypassing libc
2526
#
2627
# Options can be forced by specifying "USE_xxx=1" or can be disabled by using
2728
# "USE_xxx=" (empty string).
@@ -374,6 +375,12 @@ OPTIONS_OBJS += src/ev_kqueue.o
374375
BUILD_OPTIONS += $(call ignore_implicit,USE_KQUEUE)
375376
endif
376377
378+
ifneq ($(USE_VSYSCALL),)
379+
OPTIONS_OBJS += src/i386-linux-vsys.o
380+
OPTIONS_CFLAGS += -DCONFIG_HAP_LINUX_VSYSCALL
381+
BUILD_OPTIONS += $(call ignore_implicit,USE_VSYSCALL)
382+
endif
383+
377384
ifneq ($(USE_NETFILTER),)
378385
OPTIONS_CFLAGS += -DNETFILTER
379386
BUILD_OPTIONS += $(call ignore_implicit,USE_NETFILTER)

src/i386-linux-vsys.c

+204
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
/*
2+
* Fast system call support for x86 on Linux
3+
*
4+
* Copyright 2010 Willy Tarreau <[email protected]>
5+
*
6+
* This program is free software; you can redistribute it and/or
7+
* modify it under the terms of the GNU General Public License
8+
* as published by the Free Software Foundation; either version
9+
* 2 of the License, or (at your option) any later version.
10+
*
11+
* Recent kernels support a faster syscall ABI on x86 using the VDSO page, but
12+
* some libc that are built for CPUs earlier than i686 do not implement it.
13+
* This code bypasses the libc when the VDSO is detected. It should only be
14+
* used when it's sure that the libc really does not support the VDSO, but
15+
* fixing the libc is preferred. Using the VDSO can improve the overall
16+
* performance by about 10%.
17+
*/
18+
19+
#if defined(__linux__) && defined(__i386__)
20+
/* Silently ignore other platforms to be friendly with distro packagers */
21+
22+
#include <dlfcn.h>
23+
#include <sys/mman.h>
24+
25+
/* Default entry point: the "int80" label defined in the asm block of this
 * file, which issues "int $0x80" and returns.
 */
void int80(void); /* declared in the assembler code */
26+
/* Target of the indirect "call *vsyscall" in do_syscall. It starts as int80
 * and may be overwritten by __i386_linux_vsyscall_init().
 * NOTE(review): this variable is only read from the top-level asm and, unlike
 * back_ebx, carries no "used" attribute - confirm the compiler always emits it.
 */
static void *vsyscall = &int80; /* initialize vsyscall to use int80 by default */
27+
/* Scratch slot where the stubs save the caller %ebx before loading their
 * first syscall argument; do_syscall reloads %ebx from it after the call.
 * NOTE(review): a single static slot is shared by all stubs, so this presumably
 * relies on the callers being single-threaded - confirm.
 */
static __attribute__((used)) unsigned int back_ebx;
28+
29+
/* now we redefine some frequently used syscalls. Epoll_create is defined too
30+
* in order to replace old disabled implementations.
31+
*/
32+
/* Hand-written stubs exported under the usual libc names (.GLOBL). Each stub
 * puts a syscall number in %eax, saves the caller %ebx into back_ebx, copies
 * its arguments from the stack into %ebx, %ecx, %edx, %esi, %edi and %ebp (in
 * that order, as many as needed) and then goes through do_syscall.
 *
 * Stubs that only touch %ebx, %ecx and %edx reach do_syscall with "jmp", so
 * the final "ret" of do_syscall returns straight to the C caller. Stubs that
 * need %esi, %edi or %ebp push them first (which shifts every stack offset by
 * 4 bytes per push), use "call do_syscall", then pop them and return.
 */
asm
33+
(
34+
/* one argument at 4(%esp), syscall number 0xfe */
"epoll_create: .GLOBL epoll_create\n"
35+
" mov $0xfe, %eax\n"
36+
" mov %ebx, back_ebx\n"
37+
" mov 4(%esp), %ebx\n"
38+
" jmp do_syscall\n"
39+
/* four arguments, read from 8(%esp) upward because of the pushed %esi */
40+
"epoll_ctl: .GLOBL epoll_ctl\n"
41+
" push %esi\n"
42+
" mov $0xff, %eax\n"
43+
" mov %ebx, back_ebx\n"
44+
" mov 20(%esp), %esi\n"
45+
" mov 16(%esp), %edx\n"
46+
" mov 12(%esp), %ecx\n"
47+
" mov 8(%esp), %ebx\n"
48+
" call do_syscall\n"
49+
" pop %esi\n"
50+
" ret\n"
51+
/* same layout as epoll_ctl, syscall number 0x100 */
52+
"epoll_wait: .GLOBL epoll_wait\n"
53+
" push %esi\n"
54+
" mov $0x100, %eax\n"
55+
" mov %ebx, back_ebx\n"
56+
" mov 20(%esp), %esi\n"
57+
" mov 16(%esp), %edx\n"
58+
" mov 12(%esp), %ecx\n"
59+
" mov 8(%esp), %ebx\n"
60+
" call do_syscall\n"
61+
" pop %esi\n"
62+
" ret\n"
63+
/* six arguments, read from 16(%esp) upward because three registers are pushed */
64+
"splice: .GLOBL splice\n"
65+
" push %ebp\n"
66+
" push %edi\n"
67+
" push %esi\n"
68+
" mov $0x139, %eax\n"
69+
" mov %ebx, back_ebx\n"
70+
" mov 36(%esp), %ebp\n"
71+
" mov 32(%esp), %edi\n"
72+
" mov 28(%esp), %esi\n"
73+
" mov 24(%esp), %edx\n"
74+
" mov 20(%esp), %ecx\n"
75+
" mov 16(%esp), %ebx\n"
76+
" call do_syscall\n"
77+
" pop %esi\n"
78+
" pop %edi\n"
79+
" pop %ebp\n"
80+
" ret\n"
81+
/* one argument, syscall number 0x06 */
82+
"close: .GLOBL close\n"
83+
" mov $0x06, %eax\n"
84+
" mov %ebx, back_ebx\n"
85+
" mov 4(%esp), %ebx\n"
86+
" jmp do_syscall\n"
87+
/* two arguments, syscall number 0x4e */
88+
"gettimeofday: .GLOBL gettimeofday\n"
89+
" mov $0x4e, %eax\n"
90+
" mov %ebx, back_ebx\n"
91+
" mov 8(%esp), %ecx\n"
92+
" mov 4(%esp), %ebx\n"
93+
" jmp do_syscall\n"
94+
/* three arguments are always loaded, syscall number 0xdd */
95+
"fcntl: .GLOBL fcntl\n"
96+
" mov $0xdd, %eax\n"
97+
" mov %ebx, back_ebx\n"
98+
" mov 12(%esp), %edx\n"
99+
" mov 8(%esp), %ecx\n"
100+
" mov 4(%esp), %ebx\n"
101+
" jmp do_syscall\n"
102+
/* The socket stubs below only load a per-function code into %eax and jump to
 * the shared socketcall label, which finishes the register setup.
 */
103+
"socket: .GLOBL socket\n"
104+
" mov $0x01, %eax\n"
105+
" jmp socketcall\n"
106+
107+
"bind: .GLOBL bind\n"
108+
" mov $0x02, %eax\n"
109+
" jmp socketcall\n"
110+
111+
"connect: .GLOBL connect\n"
112+
" mov $0x03, %eax\n"
113+
" jmp socketcall\n"
114+
115+
"listen: .GLOBL listen\n"
116+
" mov $0x04, %eax\n"
117+
" jmp socketcall\n"
118+
119+
"accept: .GLOBL accept\n"
120+
" mov $0x05, %eax\n"
121+
" jmp socketcall\n"
122+
123+
"getsockname: .GLOBL getsockname\n"
124+
" mov $0x06, %eax\n"
125+
" jmp socketcall\n"
126+
127+
"send: .GLOBL send\n"
128+
" mov $0x09, %eax\n"
129+
" jmp socketcall\n"
130+
131+
"recv: .GLOBL recv\n"
132+
" mov $0x0a, %eax\n"
133+
" jmp socketcall\n"
134+
135+
"shutdown: .GLOBL shutdown\n"
136+
" mov $0x0d, %eax\n"
137+
" jmp socketcall\n"
138+
139+
"setsockopt: .GLOBL setsockopt\n"
140+
" mov $0x0e, %eax\n"
141+
" jmp socketcall\n"
142+
143+
"getsockopt: .GLOBL getsockopt\n"
144+
" mov $0x0f, %eax\n"
145+
" jmp socketcall\n"
146+
/* Moves the per-function code from %eax to %ebx, sets the syscall number to
 * 0x66 and points %ecx at the first stack argument of the original caller.
 */
147+
"socketcall:\n"
148+
" mov %ebx, back_ebx\n"
149+
" mov %eax, %ebx\n"
150+
" mov $0x66, %eax\n"
151+
" lea 4(%esp), %ecx\n"
152+
/* fall through */
153+
/* Common tail: indirect call through vsyscall, restore %ebx from back_ebx,
 * then turn a result that is at or above 0xfffff000 (unsigned compare) into
 * an errno value stored through __errno_location() and a -1 return value.
 */
154+
"do_syscall:\n"
155+
" call *vsyscall\n" // always valid, may be int80 or vsyscall
156+
" mov back_ebx, %ebx\n"
157+
" cmpl $0xfffff000, %eax\n" // consider -4096..-1 for errno
158+
" jae 0f\n"
159+
" ret\n"
160+
"0:\n" // error handling
161+
" neg %eax\n" // get errno value
162+
" push %eax\n" // save it
163+
" call __errno_location\n"
164+
" popl (%eax)\n" // store the pushed errno into the proper location
165+
" mov $-1, %eax\n" // and return -1
166+
" ret\n"
167+
/* initial value of the vsyscall pointer, see its declaration */
168+
"int80:\n" // default compatible calling convention
169+
" int $0x80\n"
170+
" ret\n"
171+
);
172+
173+
/* Declared as a constructor: tries to replace the default "int80" entry point
 * stored in the "vsyscall" pointer with the kernel-provided one. Takes no
 * argument and returns nothing; when no entry point is found by the selected
 * method, "vsyscall" is left unchanged.
 */
__attribute__((constructor))
174+
static void __i386_linux_vsyscall_init(void)
175+
{
176+
/* We can get the pointer by resolving the __kernel_vsyscall symbol
177+
* from the "linux-gate.so.1" virtual shared object, but this requires
178+
* libdl. Or we can also know that the vsyscall pointer is always
179+
* located at 0xFFFFE018 when /proc/sys/abi/vsyscall32 contains the
180+
* default value 2. So we can use that once we've checked that we can
181+
* access it without faulting. The dlsym method will also work when
182+
* vsyscall32 = 1, which randomizes the VDSO address.
183+
*/
184+
#ifdef USE_VSYSCALL_DLSYM
185+
void *handle = dlopen("linux-gate.so.1", RTLD_NOW);
186+
if (handle) {
187+
void *ptr = dlsym(handle, "__kernel_vsyscall");
188+
/* NOTE(review): the handle is closed before ptr is stored and later called;
 * presumably safe because the VDSO stays mapped - confirm.
 */
dlclose(handle);
189+
if (ptr)
190+
vsyscall = ptr;
191+
}
192+
#else
193+
/* Heuristic: trying to mprotect() the VDSO area will only succeed if
194+
* it is mapped.
195+
*/
196+
/* the call also requests PROT_READ|PROT_EXEC on that 4096-byte page */
if (mprotect((void *)0xffffe000, 4096, PROT_READ|PROT_EXEC) == 0) {
197+
unsigned long ptr = *(unsigned long *)0xFFFFE018; /* VDSO is mapped */
198+
/* accept the value only if all bits of the mask are set, i.e. it is an
 * address at or above 0xFFFFE000 (with a 32-bit unsigned long)
 */
if ((ptr & 0xFFFFE000) == 0xFFFFE000)
199+
vsyscall = (void *)ptr;
200+
}
201+
#endif
202+
}
203+
204+
#endif /* defined(__linux__) && defined(__i386__) */

src/stream_sock.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,12 @@
7777
#define __NR_splice 313
7878
#endif /* $arch */
7979

80+
#if defined(CONFIG_HAP_LINUX_VSYSCALL) && defined(__linux__) && defined(__i386__)
81+
/* the syscall is redefined somewhere else */
82+
extern int splice(int fdin, loff_t *off_in, int fdout, loff_t *off_out, size_t len, unsigned long flags);
83+
#else
8084
_syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)
81-
85+
#endif
8286
#endif /* __NR_splice */
8387

8488
/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes

0 commit comments

Comments
 (0)