Skip to content

Commit cbddf46

Browse files
committed
Get mmap() working with WIN32 MSVC
- We have pretty high-quality POSIX polyfills now.
- We no longer need to override malloc().

Tracked by issue #91. Improves upon #341.
1 parent e488168 commit cbddf46

File tree

8 files changed

+699
-750
lines changed

8 files changed

+699
-750
lines changed

.gitignore

-407
Large diffs are not rendered by default.

CMakeLists.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,9 @@ endif()
107107
add_executable(llama
108108
main.cpp
109109
utils.cpp
110-
utils.h)
110+
utils.h
111+
mmap.c
112+
mmap.h)
111113

112114
add_executable(quantize
113115
quantize.cpp

Makefile

+7-4
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ endif
3030
# Compile flags
3131
#
3232

33-
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC -g -fno-omit-frame-pointer
34-
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -g -fno-omit-frame-pointer
33+
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34+
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
3535
LDFLAGS =
3636

3737
# OS specific
@@ -185,14 +185,17 @@ default: main quantize
185185
ggml.o: ggml.c ggml.h
186186
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
187187

188+
mmap.o: mmap.c mmap.h
189+
$(CC) $(CFLAGS) -c mmap.c -o mmap.o
190+
188191
utils.o: utils.cpp utils.h
189192
$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
190193

191194
clean:
192195
rm -f *.o main quantize
193196

194-
main: main.cpp ggml.o utils.o
195-
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
197+
main: main.cpp ggml.o utils.o mmap.o
198+
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o mmap.o -o main $(LDFLAGS)
196199
./main -h
197200

198201
quantize: quantize.cpp ggml.o utils.o

ggml.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -2437,7 +2437,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
24372437

24382438
*ctx = (struct ggml_context) {
24392439
/*.mem_size =*/ params.mem_size,
2440-
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : _malloc(params.mem_size),
2440+
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
24412441
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
24422442
/*.n_objects =*/ 0,
24432443
/*.objects_begin =*/ NULL,
@@ -2469,7 +2469,7 @@ void ggml_free(struct ggml_context * ctx) {
24692469
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
24702470

24712471
if (ctx->mem_buffer_owned) {
2472-
_free(ctx->mem_buffer);
2472+
free(ctx->mem_buffer);
24732473
}
24742474

24752475
found = true;

ggml.h

-3
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,6 @@ extern "C" {
183183
#define GGML_MAX_CONTEXTS 64
184184
#define GGML_MAX_OPT 4
185185

186-
void* _malloc(size_t n);
187-
void _free(void* p);
188-
189186
#ifdef __ARM_NEON
190187
// we use the built-in 16-bit float type
191188
typedef __fp16 ggml_fp16_t;

main.cpp

+70-102
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <cassert>
1111
#include <cmath>
1212
#include <cstdio>
13+
#include <cerrno>
1314
#include <cstring>
1415
#include <fstream>
1516
#include <map>
@@ -23,18 +24,14 @@
2324
#include <unistd.h>
2425
#include <sys/mman.h>
2526
#include <sys/stat.h>
26-
#else
27-
#include <errno.h>
28-
#define msync(addr, len_bytes, flag) winMSync
29-
#define MS_ASYNC 0
3027
#endif
3128

3229
#define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
3330
#define IS2POW(X) (!((X) & ((X)-1)))
3431

3532
#define MAGIC_PATH "magic.dat"
3633
#define MAGIC_ADDR (char *)0x330000000000
37-
#define MAGIC_GRAN 2097152
34+
#define MAGIC_GRAN 65536
3835
#define MAGIC_ALGN (sizeof(size_t) * 2)
3936

4037
#define ANSI_COLOR_RED "\x1b[31m"
@@ -104,49 +101,21 @@ struct llama_model {
104101
std::map<std::string, struct ggml_tensor *> tensors;
105102
};
106103

107-
108104
struct magic {
109105
uint32_t magic;
110106
std::atomic<unsigned> lock;
111107
int fd;
112-
size_t commit;
113-
size_t offset;
114-
size_t capacity;
115-
gpt_vocab* vocab;
116-
llama_model* model;
108+
uint64_t commit;
109+
uint64_t offset;
110+
uint64_t capacity;
111+
gpt_vocab *vocab;
112+
llama_model *model;
117113
};
118114

119-
static void winMSync(magic* addr, size_t len_bytes) {
120-
bool success = FlushViewOfFile((void*)addr, len_bytes);
121-
if (!success) {
122-
LPVOID lpMsgBuf;
123-
LPVOID lpDisplayBuf;
124-
DWORD error_code = GetLastError();
125-
FormatMessage(
126-
FORMAT_MESSAGE_ALLOCATE_BUFFER |
127-
FORMAT_MESSAGE_FROM_SYSTEM |
128-
FORMAT_MESSAGE_IGNORE_INSERTS,
129-
NULL,
130-
error_code,
131-
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
132-
(LPTSTR)&lpMsgBuf,
133-
0, NULL);
134-
lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
135-
(lstrlen((LPCTSTR)lpMsgBuf) + 40) * sizeof(TCHAR));
136-
StringCchPrintf((LPTSTR)lpDisplayBuf,
137-
LocalSize(lpDisplayBuf) / sizeof(TCHAR),
138-
TEXT("failed with error %d: %s"),
139-
error_code, lpMsgBuf);
140-
}
141-
HANDLE hFile = (HANDLE)_get_osfhandle(addr->fd);
142-
FlushFileBuffers(hFile);
143-
}
144-
145-
146115
static struct magic *mag;
147116

148117
static inline void spin_lock(std::atomic<unsigned> &lock) {
149-
while (!lock.exchange(1, std::memory_order_acquire));
118+
while (lock.exchange(1, std::memory_order_acquire));
150119
}
151120

152121
static inline void spin_unlock(std::atomic<unsigned> &lock) {
@@ -162,62 +131,64 @@ static void *Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t
162131
}
163132

164133
static void magic_commit(void) {
165-
mag->offset = mag->capacity;
166-
mag->commit = mag->capacity;
134+
mag->commit = ROUNDUP(mag->offset, MAGIC_GRAN);
167135
mag->magic = 0xFEEDABEE;
168-
bool success = msync(mag, mag->commit, MS_ASYNC);
136+
if (msync(mag, mag->commit, MS_ASYNC) == -1) {
137+
perror("msync");
138+
exit(77);
139+
}
169140
}
170141

171142
static void magic_init(void) {
172143
int fd;
173144
size_t n;
174-
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
175-
struct stat st;
176-
#else
177-
struct _stat64 st;
178-
#endif
145+
int64_t size;
179146
if (mag) return;
180147
n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
181148
if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
182-
int result = fstat(fd, &st);
183-
int error = errno;
184-
if (errno == EBADF)
185-
fprintf(stderr, "Bad file descriptor.\n");
186-
else if (errno == EINVAL)
187-
fprintf(stderr, "Invalid argument to _fstat.\n");
188-
if (st.st_size >= n) {
149+
if ((size = lseek(fd, 0, SEEK_END)) == -1) {
150+
perror("lseek");
151+
exit(77);
152+
}
153+
if (size >= n) {
189154
mag = (struct magic *)Mmap(MAGIC_ADDR, n,
190155
PROT_READ | PROT_WRITE,
191156
MAP_PRIVATE | MAP_FIXED, fd, 0);
192157
if (mag->magic == 0xFEEDABEE) {
193-
mag = (struct magic *)Mmap(MAGIC_ADDR, mag->capacity,
158+
mag = (struct magic *)Mmap(MAGIC_ADDR, mag->commit,
194159
PROT_READ | PROT_WRITE,
195160
MAP_PRIVATE | MAP_FIXED, fd, 0);
196161
madvise(MAGIC_ADDR, mag->capacity, MADV_WILLNEED);
197-
ftruncate(fd, mag->commit);
198162
mag->offset = mag->commit;
199163
mag->capacity = mag->commit;
200164
mag->fd = -1;
201165
return;
202166
}
203167
}
204-
ftruncate(fd, 0);
168+
if (ftruncate(fd, 0) == -1) {
169+
perror("ftruncate");
170+
exit(77);
171+
}
205172
} else if ((fd = open(MAGIC_PATH, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
206173
perror(MAGIC_PATH);
207174
exit(77);
208175
}
209-
ftruncate(fd, n);
176+
if (ftruncate(fd, n) == -1) {
177+
perror("ftruncate");
178+
exit(77);
179+
}
210180
mag = (struct magic *)Mmap(MAGIC_ADDR, n,
211181
PROT_READ | PROT_WRITE,
212182
MAP_SHARED | MAP_FIXED, fd, 0);
213-
mag->offset = MAGIC_GRAN;
183+
mag->offset = n;
184+
mag->capacity = n;
214185
mag->fd = fd;
215186
}
216187

217-
void *memalign(size_t a, size_t n) {
188+
void *magic_memalign(size_t a, size_t n) {
218189
void *p;
219-
size_t i, j, k, m;
220190
static int count;
191+
size_t i, j, k, m, c2;
221192
magic_init();
222193
if (a < MAGIC_ALGN) a = MAGIC_ALGN;
223194
while (!IS2POW(a)) ++a;
@@ -227,85 +198,82 @@ void *memalign(size_t a, size_t n) {
227198
i = i + sizeof(size_t);
228199
i = ROUNDUP(i, a);
229200
j = ROUNDUP(i + m, MAGIC_GRAN);
230-
//if (j > mag->capacity) {
201+
if (j > mag->capacity) {
202+
c2 = mag->capacity;
203+
if (!c2) {
204+
c2 = MAGIC_GRAN;
205+
}
206+
while (j > c2) {
207+
c2 += c2 >> 4;
208+
c2 = ROUNDUP(c2, MAGIC_GRAN);
209+
}
231210
if (!mag->magic) {
232-
int result = ftruncate(mag->fd, j);
211+
if (ftruncate(mag->fd, c2) == -1) {
212+
perror("ftruncate");
213+
spin_unlock(mag->lock);
214+
return 0;
215+
}
233216
p = mmap(MAGIC_ADDR + mag->capacity,
234-
j - mag->capacity, PROT_READ | PROT_WRITE,
217+
c2 - mag->capacity, PROT_READ | PROT_WRITE,
235218
MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
236219
} else {
237220
p = mmap(MAGIC_ADDR + mag->capacity,
238-
j - mag->capacity, PROT_READ | PROT_WRITE,
239-
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
221+
c2 - mag->capacity, PROT_READ | PROT_WRITE,
222+
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
240223
}
241224
if (p != MAP_FAILED) {
242-
mag->capacity = j;
225+
mag->capacity = c2;
243226
} else {
227+
perror("mmap");
244228
spin_unlock(mag->lock);
245229
return 0;
246230
}
247-
//}
231+
}
248232
mag->offset = i + m;
249233
spin_unlock(mag->lock);
250234
p = MAGIC_ADDR + i;
251235
((size_t *)p)[-1] = n;
252236
return p;
253237
}
254238

255-
void *_malloc(size_t n) {
256-
return memalign(MAGIC_ALGN, n);
257-
}
258-
259-
size_t malloc_usable_size(const void *p) {
260-
return ((const size_t *)p)[-1];
239+
void *magic_malloc(size_t n) {
240+
return magic_memalign(MAGIC_ALGN, n);
261241
}
262242

263-
void *_calloc(size_t n, size_t z) {
243+
void *magic_calloc(size_t n, size_t z) {
264244
void *p;
265-
if ((p = _malloc((n *= z)))) {
245+
if ((p = magic_malloc((n *= z)))) {
266246
memset(p, 0, n);
267247
}
268248
return p;
269249
}
270250

271-
void _free(void *p) {
251+
void magic_free(void *p) {
272252
// do nothing
273253
}
274254

275-
void *_realloc(void *p, size_t n) {
255+
void *magic_realloc(void *p, size_t n) {
276256
void *q;
277257
if (!p) {
278-
return _malloc(n);
258+
return magic_malloc(n);
279259
}
280260
if (!n) {
281-
_free(p);
261+
magic_free(p);
282262
return 0;
283263
}
284-
if ((q = _malloc(n))) {
264+
if ((q = magic_malloc(n))) {
285265
memcpy(q, p, ((const size_t *)p)[-1]);
286266
}
287267
return q;
288268
}
289269

290-
#if defined(malloc)
291-
# undef malloc
292-
#endif
293-
#define malloc(x) _malloc(x)
294-
295-
#if defined(calloc)
296-
# undef calloc
297-
#endif
298-
#define calloc(x) _calloc(x)
299-
300-
#if defined(realloc)
301-
# undef realloc
302-
#endif
303-
#define realloc(x) _realloc(x)
270+
void* operator new(size_t size) {
271+
return magic_malloc(size);
272+
}
304273

305-
#if defined(free)
306-
# undef free
307-
#endif
308-
#define free(x) _free(x)
274+
void operator delete(void* p) {
275+
magic_free(p);
276+
}
309277

310278
// load the model's weights from a file
311279
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
@@ -451,7 +419,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
451419
{
452420
struct ggml_init_params params = {
453421
/*.mem_size =*/ ctx_size,
454-
/*.mem_buffer =*/ NULL,
422+
/*.mem_buffer =*/ magic_malloc(ctx_size),
455423
};
456424

457425
model.ctx = ggml_init(params);
@@ -772,15 +740,15 @@ bool llama_eval(
772740
const int d_key = n_embd/n_head;
773741

774742
static size_t buf_size = 512u*1024*1024;
775-
static void * buf = _malloc(buf_size);
743+
static void * buf = malloc(buf_size);
776744

777745
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
778746
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
779747
//fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
780748

781749
// reallocate
782750
buf_size = buf_size_new;
783-
buf = _realloc(buf, buf_size);
751+
buf = realloc(buf, buf_size);
784752
if (buf == nullptr) {
785753
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
786754
return false;

0 commit comments

Comments
 (0)