From a568e5bc416b601b734e6c6815dce12b0901c0e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Marczewski?= Date: Fri, 25 Jun 2021 11:58:35 +0200 Subject: [PATCH] [LibOS] Implement POSIX locks (fcntl) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Paweł Marczewski --- LibOS/shim/include/shim_fs.h | 9 + LibOS/shim/include/shim_fs_lock.h | 104 ++++ LibOS/shim/include/shim_ipc.h | 38 ++ LibOS/shim/src/bookkeep/shim_handle.c | 17 + LibOS/shim/src/fs/shim_dcache.c | 2 + LibOS/shim/src/fs/shim_fs_lock.c | 610 +++++++++++++++++++++++ LibOS/shim/src/ipc/shim_ipc_fs_lock.c | 155 ++++++ LibOS/shim/src/ipc/shim_ipc_worker.c | 4 + LibOS/shim/src/meson.build | 2 + LibOS/shim/src/shim_init.c | 2 + LibOS/shim/src/sys/shim_exit.c | 9 +- LibOS/shim/src/sys/shim_fcntl.c | 158 +++++- LibOS/shim/test/ltp/ltp.cfg | 112 +---- LibOS/shim/test/regression/.gitignore | 1 + LibOS/shim/test/regression/Makefile | 1 + LibOS/shim/test/regression/fcntl_lock.c | 353 +++++++++++++ LibOS/shim/test/regression/test_libos.py | 4 + 17 files changed, 1474 insertions(+), 107 deletions(-) create mode 100644 LibOS/shim/include/shim_fs_lock.h create mode 100644 LibOS/shim/src/fs/shim_fs_lock.c create mode 100644 LibOS/shim/src/ipc/shim_ipc_fs_lock.c create mode 100644 LibOS/shim/test/regression/fcntl_lock.c diff --git a/LibOS/shim/include/shim_fs.h b/LibOS/shim/include/shim_fs.h index 91d7fd3ef4..f1fa023674 100644 --- a/LibOS/shim/include/shim_fs.h +++ b/LibOS/shim/include/shim_fs.h @@ -115,6 +115,8 @@ struct shim_fs_ops { * pretends to have many files in a directory. */ #define DENTRY_MAX_CHILDREN 1000000 +struct fs_lock_info; + DEFINE_LIST(shim_dentry); DEFINE_LISTP(shim_dentry); struct shim_dentry { @@ -152,6 +154,13 @@ struct shim_dentry { /* Filesystem-specific data. Protected by `lock`. */ void* data; + /* File lock information, stored in the main process. See shim_fs_lock.c. 
*/ + struct fs_lock* fs_lock; + + /* True if the file might have locks placed by current process. Used in processes other than + * main process, to prevent unnecessary IPC calls on handle close. See shim_fs_lock.c. */ + bool maybe_has_locks; + struct shim_lock lock; REFTYPE ref_count; }; diff --git a/LibOS/shim/include/shim_fs_lock.h b/LibOS/shim/include/shim_fs_lock.h new file mode 100644 index 0000000000..5d41884a05 --- /dev/null +++ b/LibOS/shim/include/shim_fs_lock.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2021 Intel Corporation + * Paweł Marczewski + */ + +/* + * File locks. Currently POSIX locks are implemented. + */ + +#ifndef SHIM_FS_LOCK_H_ +#define SHIM_FS_LOCK_H_ + + +#include + +#include "list.h" +#include "shim_types.h" + +#define FS_LOCK_EOF ((uint64_t)-1) + +struct shim_dentry; + +/* Initialize the file locking subsystem. */ +int init_fs_lock(void); + +/* + * POSIX locks (also known as advisory record locks). See `man fcntl` for details. + * + * The current implementation works over IPC and handles all requests in the main process. It has + * the following caveats: + * + * - Lock requests from other processes will always have the overhead of IPC round-trip, even if the + * lock is uncontested. + * - The main process has to be able to look up the same file, so locking will not work for files in + * local-process-only filesystems (tmpfs). + * - There is no deadlock detection (EDEADLK). + * - The lock requests cannot be interrupted (EINTR). + * - The locks work only on files that have a dentry (no pipes, sockets etc.) 
+ */
+
+DEFINE_LISTP(posix_lock);
+DEFINE_LIST(posix_lock);
+struct posix_lock {
+    /* Lock type: F_RDLCK, F_WRLCK, F_UNLCK */
+    int type;
+
+    /* First byte of range */
+    uint64_t start;
+
+    /* Last byte of range (use FS_LOCK_EOF for a range until end of file) */
+    uint64_t end;
+
+    /* PID of process taking the lock */
+    IDTYPE pid;
+
+    /* List node, used internally */
+    LIST_TYPE(posix_lock) list;
+};
+
+/*!
+ * \brief Set or remove a lock on a file
+ *
+ * \param dent the dentry for a file
+ * \param pl   parameters of new lock
+ * \param wait if true, will wait until a lock can be taken
+ *
+ * This is the equivalent of `fcntl(F_SETLK/F_SETLKW)`.
+ *
+ * If `pl->type` is `F_UNLCK`, the function will remove any locks held by the given PID for the
+ * given range. Removing a lock never waits.
+ *
+ * If `pl->type` is `F_RDLCK` or `F_WRLCK`, the function will create a new lock for the given PID
+ * and range, replacing the existing locks held by the given PID for that range. If there are
+ * conflicting locks, the function either waits (if `wait` is true), or fails with `-EAGAIN` (if
+ * `wait` is false).
+ */
+int posix_lock_set(struct shim_dentry* dent, struct posix_lock* pl, bool wait);
+
+/*!
+ * \brief Check for conflicting locks on a file
+ *
+ * \param      dent   the dentry for a file
+ * \param      pl     parameters of new lock (type cannot be `F_UNLCK`)
+ * \param[out] out_pl on success, set to `F_UNLCK` or details of a conflicting lock
+ *
+ * This is the equivalent of `fcntl(F_GETLK)`.
+ *
+ * The function checks if there are locks by other PIDs preventing the proposed lock from being
+ * placed. If the lock could be placed, `out_pl->type` is set to `F_UNLCK`. Otherwise, `out_pl`
+ * fields (`type`, `start`, `end`, `pid`) are set to details of a conflicting lock.
+ */
+int posix_lock_get(struct shim_dentry* dent, struct posix_lock* pl, struct posix_lock* out_pl);
+
+/* Removes all locks for a given PID. Should be called before process exit.
*/ +int posix_lock_clear_pid(IDTYPE pid); + +/* Version of `posix_lock_set` called from IPC callback. */ +int posix_lock_set_from_ipc(const char* path, struct posix_lock* pl, bool wait, IDTYPE vmid, + unsigned long seq, bool* postponed); + +/* Version of `posix_lock_get` called from IPC callback. */ +int posix_lock_get_from_ipc(const char* path, struct posix_lock* pl, struct posix_lock* out_pl); + +#endif /* SHIM_FS_LOCK_H */ diff --git a/LibOS/shim/include/shim_ipc.h b/LibOS/shim/include/shim_ipc.h index 98dea12f4a..43f9deb55e 100644 --- a/LibOS/shim/include/shim_ipc.h +++ b/LibOS/shim/include/shim_ipc.h @@ -32,6 +32,9 @@ enum { IPC_MSG_SYNC_CONFIRM_UPGRADE, IPC_MSG_SYNC_CONFIRM_DOWNGRADE, IPC_MSG_SYNC_CONFIRM_CLOSE, + IPC_MSG_POSIX_LOCK_SET, + IPC_MSG_POSIX_LOCK_GET, + IPC_MSG_POSIX_LOCK_CLEAR_PID, IPC_MSG_CODE_BOUND, }; @@ -253,4 +256,39 @@ int ipc_sync_confirm_upgrade_callback(IDTYPE src, void* data, unsigned long seq) int ipc_sync_confirm_downgrade_callback(IDTYPE src, void* data, unsigned long seq); int ipc_sync_confirm_close_callback(IDTYPE src, void* data, unsigned long seq); +/* + * POSIX_LOCK_SET: `struct shim_ipc_posix_lock` -> `int` + * POSIX_LOCK_GET: `struct shim_ipc_posix_lock` -> `struct shim_ipc_posix_lock_resp` + * POSIX_LOCK_CLEAR_PID: `IDTYPE` -> `int` + */ + +struct shim_ipc_posix_lock { + int type; + uint64_t start; + uint64_t end; + IDTYPE pid; + + bool wait; + char path[]; /* null-terminated */ +}; + +struct shim_ipc_posix_lock_resp { + int result; + + int type; + uint64_t start; + uint64_t end; + IDTYPE pid; +}; + +struct posix_lock; + +int ipc_posix_lock_set(const char* path, struct posix_lock* pl, bool wait); +int ipc_posix_lock_set_send_response(IDTYPE vmid, unsigned long seq, int result); +int ipc_posix_lock_get(const char* path, struct posix_lock* pl, struct posix_lock* out_pl); +int ipc_posix_lock_clear_pid(IDTYPE pid); +int ipc_posix_lock_set_callback(IDTYPE src, void* data, unsigned long seq); +int ipc_posix_lock_get_callback(IDTYPE 
src, void* data, unsigned long seq);
+int ipc_posix_lock_clear_pid_callback(IDTYPE src, void* data, unsigned long seq);
+
 #endif /* SHIM_IPC_H_ */
diff --git a/LibOS/shim/src/bookkeep/shim_handle.c b/LibOS/shim/src/bookkeep/shim_handle.c
index f72b438021..2106871427 100644
--- a/LibOS/shim/src/bookkeep/shim_handle.c
+++ b/LibOS/shim/src/bookkeep/shim_handle.c
@@ -9,6 +9,7 @@
 #include "pal_error.h"
 #include "shim_checkpoint.h"
 #include "shim_fs.h"
+#include "shim_fs_lock.h"
 #include "shim_handle.h"
 #include "shim_internal.h"
 #include "shim_lock.h"
@@ -313,6 +314,22 @@ struct shim_handle* detach_fd_handle(FDTYPE fd, int* flags, struct shim_handle_m
         handle = __detach_fd_handle(handle_map->map[fd], flags, handle_map);
 
     unlock(&handle_map->lock);
+
+    if (handle && handle->dentry) {
+        /* Clear POSIX locks for a file. We are required to do that every time a FD is closed,
+         * even if the process holds other handles for that file, or duplicated FDs for the
+         * same handle. */
+        struct posix_lock pl = {
+            .type = F_UNLCK,
+            .start = 0,
+            .end = FS_LOCK_EOF,
+            .pid = g_process.pid,
+        };
+        int ret = posix_lock_set(handle->dentry, &pl, /*wait=*/false);
+        if (ret < 0)
+            log_warning("error releasing locks: %d\n", ret);
+    }
+
     return handle;
 }
 
diff --git a/LibOS/shim/src/fs/shim_dcache.c b/LibOS/shim/src/fs/shim_dcache.c
index 8840acc76f..7ac0fb6cd7 100644
--- a/LibOS/shim/src/fs/shim_dcache.c
+++ b/LibOS/shim/src/fs/shim_dcache.c
@@ -453,6 +453,8 @@ BEGIN_CP_FUNC(dentry) {
             new_dent->data = NULL;
         }
 
+        new_dent->fs_lock = NULL;
+
         DO_CP_IN_MEMBER(qstr, new_dent, name);
 
         if (new_dent->mount)
diff --git a/LibOS/shim/src/fs/shim_fs_lock.c b/LibOS/shim/src/fs/shim_fs_lock.c
new file mode 100644
index 0000000000..32f288d299
--- /dev/null
+++ b/LibOS/shim/src/fs/shim_fs_lock.c
@@ -0,0 +1,610 @@
+/* SPDX-License-Identifier: LGPL-3.0-or-later */
+/* Copyright (C) 2021 Intel Corporation
+ *                    Paweł Marczewski
+ */
+
+#include
+
+#include "shim_fs.h"
+#include "shim_fs_lock.h"
+#include "shim_ipc.h"
+#include "shim_lock.h"
+
+/* Describes a pending request for a POSIX lock, either local or remote. After processing the
+ * request, the object is removed, and a possible waiter is notified (see below). */
+DEFINE_LISTP(posix_lock_request);
+DEFINE_LIST(posix_lock_request);
+struct posix_lock_request {
+    struct posix_lock pl;
+
+    /* For local requests, `event` should be set to an event handle. After processing the request,
+     * the event will be triggered, and `*result` will be set to the result. */
+    PAL_HANDLE event;
+    int* result;
+
+    /* For remote requests, `vmid` and `seq` should be set to parameters of IPC message. After
+     * processing the request, IPC response will be sent. `seq` has the same type as the `seq`
+     * argument of the IPC callbacks and of `ipc_posix_lock_set_send_response`, so that the
+     * sequence number is stored without truncation. */
+    IDTYPE vmid;
+    unsigned long seq;
+
+    LIST_TYPE(posix_lock_request) list;
+};
+
+/* Describes file lock details for a given dentry. Currently holds only POSIX locks. */
+DEFINE_LISTP(fs_lock);
+DEFINE_LIST(fs_lock);
+struct fs_lock {
+    struct shim_dentry* dent;
+
+    /* POSIX locks, sorted by PID and then by start position. The ranges do not overlap within a
+     * given PID. */
+    LISTP_TYPE(posix_lock) posix_locks;
+
+    /* Pending requests. */
+    LISTP_TYPE(posix_lock_request) posix_lock_requests;
+
+    LIST_TYPE(fs_lock) list;
+};
+
+/* Global list of `fs_lock` objects. Used for cleanup. */
+static LISTP_TYPE(fs_lock) g_fs_lock_list = LISTP_INIT;
+
+/* Global lock for all operations on filesystem locks, including access to dentry `fs_lock` and
+ * `maybe_has_locks` fields.
*/
+static struct shim_lock g_fs_lock_lock;
+
+/*
+ * Initialize the file locking subsystem by creating `g_fs_lock_lock`.
+ *
+ * The lock is created in every process, including the ones that have `leader_vmid` set and
+ * forward their requests over IPC: `posix_lock_set` takes `g_fs_lock_lock` in those processes
+ * too, in order to protect the dentry `maybe_has_locks` field.
+ */
+int init_fs_lock(void) {
+    return create_lock(&g_fs_lock_lock);
+}
+
+/*
+ * Retrieve `dent->fs_lock` into `*out_fs_lock`. If the dentry has no `fs_lock` yet and `create` is
+ * true, a new object is allocated (holding a reference to the dentry) and added to
+ * `g_fs_lock_list`. With `create` false, `*out_fs_lock` may be set to NULL. Returns 0, or -ENOMEM
+ * if the allocation fails.
+ */
+static int find_fs_lock(struct shim_dentry* dent, bool create, struct fs_lock** out_fs_lock) {
+    assert(locked(&g_fs_lock_lock));
+    if (!dent->fs_lock && create) {
+        struct fs_lock* fs_lock = malloc(sizeof(*fs_lock));
+        if (!fs_lock)
+            return -ENOMEM;
+        fs_lock->dent = dent;
+        get_dentry(dent);
+        INIT_LISTP(&fs_lock->posix_locks);
+        INIT_LISTP(&fs_lock->posix_lock_requests);
+        dent->fs_lock = fs_lock;
+
+        LISTP_ADD(fs_lock, &g_fs_lock_list, list);
+    }
+    *out_fs_lock = dent->fs_lock;
+    return 0;
+}
+
+static int posix_lock_dump_write_all(const char* str, size_t size, void* arg) {
+    __UNUSED(arg);
+    log_always("posix_lock: %.*s\n", (int)size, str);
+    return 0;
+}
+
+/* Log current locks for a file, for debugging purposes. */
+static void posix_lock_dump(struct fs_lock* fs_lock) {
+    assert(locked(&g_fs_lock_lock));
+    struct print_buf buf = INIT_PRINT_BUF(&posix_lock_dump_write_all);
+    IDTYPE pid = 0;
+
+    struct posix_lock* pl;
+    LISTP_FOR_EACH_ENTRY(pl, &fs_lock->posix_locks, list) {
+        if (pl->pid != pid) {
+            if (pid != 0)
+                buf_flush(&buf);
+            pid = pl->pid;
+            buf_printf(&buf, "%d:", pid);
+        }
+
+        char c;
+        switch (pl->type) {
+            case F_RDLCK: c = 'r'; break;
+            case F_WRLCK: c = 'w'; break;
+            default: c = '?'; break;
+        }
+        if (pl->end == FS_LOCK_EOF) {
+            buf_printf(&buf, " %c[%lu..end]", c, pl->start);
+        } else {
+            buf_printf(&buf, " %c[%lu..%lu]", c, pl->start, pl->end);
+        }
+    }
+    if (LISTP_EMPTY(&fs_lock->posix_locks)) {
+        buf_printf(&buf, "no locks");
+    }
+    buf_flush(&buf);
+}
+
+/* Removes `fs_lock` if it's not necessary (i.e. no locks are held or requested for a file).
*/
+static void fs_lock_gc(struct fs_lock* fs_lock) {
+    assert(locked(&g_fs_lock_lock));
+    if (g_log_level >= LOG_LEVEL_DEBUG)
+        posix_lock_dump(fs_lock);
+    if (LISTP_EMPTY(&fs_lock->posix_locks) && LISTP_EMPTY(&fs_lock->posix_lock_requests)) {
+        struct shim_dentry* dent = fs_lock->dent;
+        dent->fs_lock = NULL;
+        put_dentry(dent);
+
+        LISTP_DEL(fs_lock, &g_fs_lock_list, list);
+        free(fs_lock);
+    }
+}
+
+/*
+ * Find first lock that conflicts with `pl`. Two locks conflict if they have different PIDs, their
+ * ranges overlap, and at least one of them is a write lock.
+ */
+static struct posix_lock* posix_lock_find(struct fs_lock *fs_lock, struct posix_lock* pl) {
+    assert(locked(&g_fs_lock_lock));
+    assert(pl->type != F_UNLCK);
+
+    struct posix_lock* cur;
+    LISTP_FOR_EACH_ENTRY(cur, &fs_lock->posix_locks, list) {
+        if (cur->pid != pl->pid && pl->start <= cur->end && cur->start <= pl->end
+               && (cur->type == F_WRLCK || pl->type == F_WRLCK))
+            return cur;
+    }
+    return NULL;
+}
+
+/*
+ * Add a new lock request. Before releasing `g_fs_lock_lock`, the caller has to add notification
+ * info to the request (see `struct posix_lock_request` above).
+ *
+ * All notification fields start out cleared (`event`/`result` NULL, `vmid`/`seq` zero), because
+ * `posix_lock_process_requests` checks both `req->event` and `req->vmid` for every request and
+ * must not see garbage in the pair that the caller does not fill in.
+ *
+ * Returns 0 and stores the request in `*out_req`, or -ENOMEM if the allocation fails.
+ */
+static int posix_lock_add_request(struct fs_lock *fs_lock, struct posix_lock* pl,
+                                  struct posix_lock_request** out_req) {
+    assert(locked(&g_fs_lock_lock));
+    assert(pl->type != F_UNLCK);
+
+    struct posix_lock_request *req = malloc(sizeof(*req));
+    if (!req)
+        return -ENOMEM;
+    req->pl = *pl;
+    req->event = NULL;
+    req->result = NULL;
+    req->vmid = 0;
+    req->seq = 0;
+    LISTP_ADD(req, &fs_lock->posix_lock_requests, list);
+    *out_req = req;
+    return 0;
+}
+
+/*
+ * Main part of `posix_lock_set`. Adds/removes a lock (depending on `pl->type`), assumes we already
+ * verified there are no conflicts. Replaces existing locks for a given PID, and merges adjacent
+ * locks if possible.
+ *
+ * See also Linux sources (`fs/locks.c`) for a similar implementation.
+ */ +static int _posix_lock_set(struct fs_lock *fs_lock, struct posix_lock* pl) { + assert(locked(&g_fs_lock_lock)); + + /* Preallocate new locks first, so that we don't fail after modifying something. */ + + /* Lock to be added. Not necessary for F_UNLCK, because we're only removing existing locks. */ + struct posix_lock* new = NULL; + if (pl->type != F_UNLCK) { + new = malloc(sizeof(*new)); + if (!new) + return -ENOMEM; + } + + /* Extra lock that we might need when splitting existing one. */ + struct posix_lock* extra = malloc(sizeof(*extra)); + if (!extra) { + free(new); + return -ENOMEM; + } + + /* Target range: we will be changing it when merging existing locks. */ + uint64_t start = pl->start, end = pl->end; + + /* `prev` will be set to the last lock before target range, so that we add the new lock just + * after `prev`. */ + struct posix_lock* prev = NULL; + + struct posix_lock* cur; + struct posix_lock* tmp; + LISTP_FOR_EACH_ENTRY_SAFE(cur, tmp, &fs_lock->posix_locks, list) { + if (cur->pid < pl->pid) { + prev = cur; + continue; + } + if (pl->pid < cur->pid) { + break; + } + + if (pl->type == cur->type) { + /* Same lock type: we can possibly merge the locks. */ + + if (start > 0 && cur->end < start - 1) { + /* `cur` ends before target range begins, and is not even adjacent */ + prev = cur; + } else if (end < FS_LOCK_EOF && end + 1 < cur->start) { + /* `cur` begins after target range ends, and is not even adjacent - we're + * done */ + break; + } else { + /* `cur` is either adjacent to target range, or overlaps with it. Delete it, and + * expand the target range. */ + start = MIN(start, cur->start); + end = MAX(end, cur->end); + LISTP_DEL(cur, &fs_lock->posix_locks, list); + free(cur); + } + } else { + /* Different lock types: if they overlap, we delete the target range. 
*/ + + if (cur->end < start) { + /* `cur` ends before target range begins */ + prev = cur; + } else if (end < cur->start) { + /* `cur` begins after target range ends - we're done */ + break; + } else if (cur->start < start && cur->end <= end) { + /* + * `cur` overlaps with beginning of target range. Shorten `cur`. + * + * cur: ======= + * tgt: ------- + * + * cur: == + */ + assert(start > 0); + cur->end = start - 1; + prev = cur; + } else if (cur->start < start && cur->end > end) { + /* + * The target range is inside `cur`. Split `cur` and finish. + * + * cur: ======== + * tgt: ---- + * + * cur: == + * extra: == + */ + + /* We'll need `extra` only once, because we exit the loop afterwards. */ + assert(extra); + + assert(start > 0); + extra->type = cur->type; + extra->start = end + 1; + extra->end = cur->end; + extra->pid = cur->pid; + cur->end = start - 1; + LISTP_ADD_AFTER(extra, cur, &fs_lock->posix_locks, list); + extra = NULL; + /* We're done: the new lock, if any, will be added after `cur`. */ + prev = cur; + break; + } else if (start <= cur->start && cur->end <= end) { + /* + * `cur` is completely covered by target range. Delete `cur`. + * + * cur: ==== + * tgt: -------- + */ + LISTP_DEL(cur, &fs_lock->posix_locks, list); + free(cur); + } else { + /* + * `cur` overlaps with end of target range. Shorten `cur` and finish. + * + * cur: ==== + * tgt: ----- + * + * cur: == + */ + assert(start <= cur->start && end < cur->end); + assert (end < FS_LOCK_EOF); + cur->start = end + 1; + break; + } + } + } + + if (new) { + assert(pl->type != F_UNLCK); + + + new->type = pl->type; + new->start = start; + new->end = end; + new->pid = pl->pid; + +#ifdef DEBUG + /* Assert that list order is preserved */ + struct posix_lock* next = prev ? 
LISTP_NEXT_ENTRY(prev, &fs_lock->posix_locks, list) + : LISTP_FIRST_ENTRY(&fs_lock->posix_locks, struct posix_lock, list); + if (prev) + assert(prev->pid < pl->pid || (prev->pid == pl->pid && prev->end < start)); + if (next) + assert(pl->pid < next->pid || (pl->pid == next->pid && end < next->start)); +#endif + + if (prev) { + LISTP_ADD_AFTER(new, prev, &fs_lock->posix_locks, list); + } else { + LISTP_ADD(new, &fs_lock->posix_locks, list); + } + } + + if (extra) + free(extra); + return 0; +} + +/* + * Process pending requests after modifying the list of locks. + * + * TODO: This is pretty inefficient, but perhaps good enough for now... + */ +static void posix_lock_process_requests(struct fs_lock *fs_lock) { + assert(locked(&g_fs_lock_lock)); + + bool changed; + do { + changed = false; + + struct posix_lock_request* req; + struct posix_lock_request* tmp; + LISTP_FOR_EACH_ENTRY_SAFE(req, tmp, &fs_lock->posix_lock_requests, list) { + struct posix_lock* conflict = posix_lock_find(fs_lock, &req->pl); + if (!conflict) { + int result = _posix_lock_set(fs_lock, &req->pl); + LISTP_DEL(req, &fs_lock->posix_lock_requests, list); + + if (req->event) { + assert(req->result); + *req->result = result; + DkEventSet(req->event); + } + if (req->vmid) { + int ret = ipc_posix_lock_set_send_response(req->vmid, req->seq, result); + if (ret < 0) { + log_warning("posix lock: error sending result over IPC: %d\n", ret); + } + } + free(req); + changed = true; + } + } + } while (changed); +} + +static int posix_lock_set_or_add_request(struct shim_dentry* dent, struct posix_lock* pl, bool wait, + struct posix_lock_request** out_req) { + assert(locked(&g_fs_lock_lock)); + + struct fs_lock* fs_lock = NULL; + int ret = find_fs_lock(dent, /*create=*/pl->type != F_UNLCK, &fs_lock); + if (ret < 0) + goto out; + if (!fs_lock) { + assert(pl->type == F_UNLCK); + /* Nothing to unlock. 
*/
+        return 0;
+    }
+
+    struct posix_lock* conflict = NULL;
+    if (pl->type != F_UNLCK)
+        conflict = posix_lock_find(fs_lock, pl);
+    if (conflict) {
+        if (!wait)
+            return -EAGAIN;
+
+        struct posix_lock_request* req;
+        ret = posix_lock_add_request(fs_lock, pl, &req);
+        if (ret < 0)
+            goto out;
+
+        *out_req = req;
+    } else {
+        *out_req = NULL;
+        ret = _posix_lock_set(fs_lock, pl);
+        if (ret < 0)
+            goto out;
+        posix_lock_process_requests(fs_lock);
+    }
+    ret = 0;
+out:
+    if (fs_lock)
+        fs_lock_gc(fs_lock);
+    return ret;
+}
+
+/*
+ * Set or remove a POSIX lock on `dent` (see the declaration in `shim_fs_lock.h`).
+ *
+ * In a process that has `leader_vmid` set, the request is forwarded over IPC using the absolute
+ * path of the dentry. Otherwise the request is handled locally; if it conflicts with an existing
+ * lock and `wait` is true, the function queues a request and blocks on an event until
+ * `posix_lock_process_requests` grants it.
+ */
+int posix_lock_set(struct shim_dentry* dent, struct posix_lock* pl, bool wait) {
+    int ret;
+    if (g_process_ipc_ids.leader_vmid) {
+        /* In the IPC version, we use `dent->maybe_has_locks` to short-circuit unlocking files that
+         * we never locked. This is to prevent unnecessary IPC calls on handle close. */
+        lock(&g_fs_lock_lock);
+        if (pl->type == F_RDLCK || pl->type == F_WRLCK) {
+            dent->maybe_has_locks = true;
+        } else if (!dent->maybe_has_locks) {
+            /* We know we're not holding any locks for the file */
+            unlock(&g_fs_lock_lock);
+            return 0;
+        }
+        unlock(&g_fs_lock_lock);
+
+        char* path;
+        ret = dentry_abs_path(dent, &path, /*size=*/NULL);
+        if (ret < 0)
+            return ret;
+
+        ret = ipc_posix_lock_set(path, pl, wait);
+        free(path);
+        return ret;
+    }
+
+    lock(&g_fs_lock_lock);
+
+    PAL_HANDLE event = NULL;
+    struct posix_lock_request* req = NULL;
+    ret = posix_lock_set_or_add_request(dent, pl, wait, &req);
+    if (ret < 0)
+        goto out;
+    if (req) {
+        assert(wait);
+
+        int result;
+        ret = DkEventCreate(&event, /*init_signaled=*/false, /*auto_clear=*/false);
+        if (ret < 0) {
+            /* Nobody is going to wait for this request. We have held `g_fs_lock_lock` since the
+             * request was queued, so it is still on the list: remove it now, otherwise it would
+             * later be granted without anyone being notified. */
+            LISTP_DEL(req, &dent->fs_lock->posix_lock_requests, list);
+            free(req);
+            fs_lock_gc(dent->fs_lock);
+            goto out;
+        }
+        req->event = event;
+        req->result = &result;
+
+        unlock(&g_fs_lock_lock);
+        ret = object_wait_with_retry(event);
+        lock(&g_fs_lock_lock);
+        /* NOTE(review): if the wait fails, the request may still be queued and pointing at
+         * `result` and `event` - confirm whether it needs to be withdrawn here as well. */
+        if (ret < 0)
+            goto out;
+
+        ret = result;
+    } else {
+        ret = 0;
+    }
+out:
+    unlock(&g_fs_lock_lock);
+    if (event)
+        DkObjectClose(event);
+    return ret;
+}
+
+int posix_lock_set_from_ipc(const char* path, struct posix_lock*
pl, bool wait, IDTYPE vmid,
+                            unsigned long seq, bool* postponed) {
+    assert(!g_process_ipc_ids.leader_vmid);
+
+    struct shim_dentry* dent = NULL;
+    struct posix_lock_request* req = NULL;
+
+    int ret = path_lookupat(g_dentry_root, path, LOOKUP_NO_FOLLOW, &dent);
+    if (ret < 0) {
+        log_warning("posix_lock_set_from_ipc: error on dentry lookup for %s: %d\n", path, ret);
+        goto out;
+    }
+
+    lock(&g_fs_lock_lock);
+    ret = posix_lock_set_or_add_request(dent, pl, wait, &req);
+    if (ret < 0) {
+        unlock(&g_fs_lock_lock);
+        goto out;
+    }
+
+    if (req) {
+        assert(wait);
+        /* The notification info has to be filled in before `g_fs_lock_lock` is released: once the
+         * lock is dropped, another thread may process the request (sending the response to
+         * `req->vmid`) and free it. */
+        req->vmid = vmid;
+        req->seq = seq;
+        *postponed = true;
+    } else {
+        *postponed = false;
+    }
+    unlock(&g_fs_lock_lock);
+    ret = 0;
+out:
+    if (dent)
+        put_dentry(dent);
+    return ret;
+}
+
+int posix_lock_get(struct shim_dentry* dent, struct posix_lock* pl, struct posix_lock* out_pl) {
+    assert(pl->type != F_UNLCK);
+
+    int ret;
+    if (g_process_ipc_ids.leader_vmid) {
+        char* path;
+        ret = dentry_abs_path(dent, &path, /*size=*/NULL);
+        if (ret < 0)
+            return ret;
+
+        ret = ipc_posix_lock_get(path, pl, out_pl);
+        free(path);
+        return ret;
+    }
+
+    lock(&g_fs_lock_lock);
+
+    struct fs_lock* fs_lock = NULL;
+    ret = find_fs_lock(dent, /*create=*/false, &fs_lock);
+    if (ret < 0)
+        goto out;
+
+    struct posix_lock* conflict = NULL;
+    if (fs_lock)
+        conflict = posix_lock_find(fs_lock, pl);
+    if (conflict) {
+        out_pl->type = conflict->type;
+        out_pl->start = conflict->start;
+        out_pl->end = conflict->end;
+        out_pl->pid = conflict->pid;
+    } else {
+        out_pl->type = F_UNLCK;
+    }
+    ret = 0;
+
+out:
+    if (fs_lock)
+        fs_lock_gc(fs_lock);
+
+    unlock(&g_fs_lock_lock);
+    return ret;
+}
+
+int posix_lock_get_from_ipc(const char* path, struct posix_lock* pl, struct posix_lock* out_pl) {
+    assert(!g_process_ipc_ids.leader_vmid);
+
+    struct shim_dentry* dent = NULL;
+    int ret = path_lookupat(g_dentry_root, path, LOOKUP_NO_FOLLOW, &dent);
+    if (ret < 0) {
+        log_warning("posix_lock_get_from_ipc: error on dentry lookup for %s: %d\n", path, ret);
+        return ret;
+    }
+
+
ret = posix_lock_get(dent, pl, out_pl); + put_dentry(dent); + return ret; +} + +int posix_lock_clear_pid(IDTYPE pid) { + if (g_process_ipc_ids.leader_vmid) { + return ipc_posix_lock_clear_pid(pid); + } + + log_debug("clearing POSIX locks for pid %d\n", pid); + + lock(&g_fs_lock_lock); + + struct fs_lock* fs_lock; + struct fs_lock* tmp; + LISTP_FOR_EACH_ENTRY_SAFE(fs_lock, tmp, &g_fs_lock_list, list) { + struct posix_lock* pl; + struct posix_lock* pl_tmp; + + bool changed = false; + LISTP_FOR_EACH_ENTRY_SAFE(pl, pl_tmp, &fs_lock->posix_locks, list) { + if (pl->pid == pid) { + LISTP_DEL(pl, &fs_lock->posix_locks, list); + free(pl); + changed = true; + } + } + + struct posix_lock_request* req; + struct posix_lock_request* req_tmp; + LISTP_FOR_EACH_ENTRY_SAFE(req, req_tmp, &fs_lock->posix_lock_requests, list) { + if (req->pl.pid == pid) { + assert(!req->event); + LISTP_DEL(req, &fs_lock->posix_lock_requests, list); + free(req); + } + } + + if (changed) { + posix_lock_process_requests(fs_lock); + fs_lock_gc(fs_lock); + } + } + + unlock(&g_fs_lock_lock); + return 0; +} diff --git a/LibOS/shim/src/ipc/shim_ipc_fs_lock.c b/LibOS/shim/src/ipc/shim_ipc_fs_lock.c new file mode 100644 index 0000000000..ed06331932 --- /dev/null +++ b/LibOS/shim/src/ipc/shim_ipc_fs_lock.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2021 Intel Corporation + * Paweł Marczewski + */ + +/* + * IPC glue code for filesystem locks. 
+ */ + +#include "shim_fs_lock.h" +#include "shim_ipc.h" + +int ipc_posix_lock_set(const char* path, struct posix_lock* pl, bool wait) { + assert(g_process_ipc_ids.leader_vmid); + + struct shim_ipc_posix_lock msgin = { + .type = pl->type, + .start = pl->start, + .end = pl->end, + .pid = pl->pid, + + .wait = wait, + }; + + size_t path_len = strlen(path); + size_t total_msg_size = get_ipc_msg_size(sizeof(msgin) + path_len + 1); + struct shim_ipc_msg* msg = __alloca(total_msg_size); + init_ipc_msg(msg, IPC_MSG_POSIX_LOCK_SET, total_msg_size); + memcpy(msg->data, &msgin, sizeof(msgin)); + memcpy(((struct shim_ipc_posix_lock*)&msg->data)->path, path, path_len + 1); + + void* data; + int ret = ipc_send_msg_and_get_response(g_process_ipc_ids.leader_vmid, msg, &data); + if (ret < 0) + return ret; + int result = *(int*)data; + free(data); + return result; +} + +int ipc_posix_lock_set_send_response(IDTYPE vmid, unsigned long seq, int result) { + assert(!g_process_ipc_ids.leader_vmid); + + size_t total_msg_size = get_ipc_msg_size(sizeof(result)); + struct shim_ipc_msg* msg = __alloca(total_msg_size); + init_ipc_response(msg, seq, total_msg_size); + memcpy(msg->data, &result, sizeof(result)); + return ipc_send_message(vmid, msg); +} + +int ipc_posix_lock_get(const char* path, struct posix_lock* pl, struct posix_lock* out_pl) { + assert(g_process_ipc_ids.leader_vmid); + + struct shim_ipc_posix_lock msgin = { + .type = pl->type, + .start = pl->start, + .end = pl->end, + .pid = pl->pid, + }; + + size_t path_len = strlen(path); + size_t total_msg_size = get_ipc_msg_size(sizeof(msgin) + path_len + 1); + struct shim_ipc_msg* msg = __alloca(total_msg_size); + init_ipc_msg(msg, IPC_MSG_POSIX_LOCK_GET, total_msg_size); + memcpy(msg->data, &msgin, sizeof(msgin)); + memcpy(((struct shim_ipc_posix_lock*)&msg->data)->path, path, path_len + 1); + + void* data; + int ret = ipc_send_msg_and_get_response(g_process_ipc_ids.leader_vmid, msg, &data); + if (ret < 0) + return ret; + + struct 
shim_ipc_posix_lock_resp* resp = data; + int result = resp->result; + if (resp->result == 0) { + out_pl->type = resp->type; + out_pl->start = resp->start; + out_pl->end = resp->end; + out_pl->pid = resp->pid; + } + free(data); + return result; +} + +int ipc_posix_lock_clear_pid(IDTYPE pid) { + assert(g_process_ipc_ids.leader_vmid); + + size_t total_msg_size = get_ipc_msg_size(sizeof(pid)); + struct shim_ipc_msg* msg = __alloca(total_msg_size); + init_ipc_msg(msg, IPC_MSG_POSIX_LOCK_CLEAR_PID, total_msg_size); + memcpy(msg->data, &pid, sizeof(pid)); + + void* data; + int ret = ipc_send_msg_and_get_response(g_process_ipc_ids.leader_vmid, msg, &data); + if (ret < 0) + return ret; + int result = *(int*)data; + free(data); + return result; +} + +int ipc_posix_lock_set_callback(IDTYPE src, void* data, unsigned long seq) { + struct shim_ipc_posix_lock* msgin = data; + struct posix_lock pl = { + .type = msgin->type, + .start = msgin->start, + .end = msgin->end, + .pid = msgin->pid, + }; + + bool postponed; + int ret = posix_lock_set_from_ipc(msgin->path, &pl, msgin->wait, src, seq, &postponed); + if (ret == 0 && postponed) { + /* The response will be sent later. 
*/ + return 0; + } + return ipc_posix_lock_set_send_response(src, seq, ret); +} + +int ipc_posix_lock_get_callback(IDTYPE src, void* data, unsigned long seq) { + struct shim_ipc_posix_lock* msgin = data; + struct posix_lock pl = { + .type = msgin->type, + .start = msgin->start, + .end = msgin->end, + .pid = msgin->pid, + }; + + struct posix_lock pl2 = {0}; + int result = posix_lock_get_from_ipc(msgin->path, &pl, &pl2); + struct shim_ipc_posix_lock_resp msgout = { + .result = result, + .type = pl2.type, + .start = pl2.start, + .end = pl2.end, + .pid = pl2.pid, + }; + + size_t total_msg_size = get_ipc_msg_size(sizeof(msgout)); + struct shim_ipc_msg* msg = __alloca(total_msg_size); + init_ipc_response(msg, seq, total_msg_size); + memcpy(msg->data, &msgout, sizeof(msgout)); + return ipc_send_message(src, msg); +} + +int ipc_posix_lock_clear_pid_callback(IDTYPE src, void* data, unsigned long seq) { + IDTYPE* pid = data; + int result = posix_lock_clear_pid(*pid); + + size_t total_msg_size = get_ipc_msg_size(sizeof(result)); + struct shim_ipc_msg* msg = __alloca(total_msg_size); + init_ipc_response(msg, seq, total_msg_size); + memcpy(msg->data, &result, sizeof(result)); + return ipc_send_message(src, msg); +} diff --git a/LibOS/shim/src/ipc/shim_ipc_worker.c b/LibOS/shim/src/ipc/shim_ipc_worker.c index 99a7b66e2f..cd30940a60 100644 --- a/LibOS/shim/src/ipc/shim_ipc_worker.c +++ b/LibOS/shim/src/ipc/shim_ipc_worker.c @@ -60,6 +60,10 @@ static ipc_callback ipc_callbacks[] = { [IPC_MSG_SYNC_CONFIRM_UPGRADE] = ipc_sync_confirm_upgrade_callback, [IPC_MSG_SYNC_CONFIRM_DOWNGRADE] = ipc_sync_confirm_downgrade_callback, [IPC_MSG_SYNC_CONFIRM_CLOSE] = ipc_sync_confirm_close_callback, + + [IPC_MSG_POSIX_LOCK_SET] = ipc_posix_lock_set_callback, + [IPC_MSG_POSIX_LOCK_GET] = ipc_posix_lock_get_callback, + [IPC_MSG_POSIX_LOCK_CLEAR_PID] = ipc_posix_lock_clear_pid_callback, }; static void ipc_leader_died_callback(void) { diff --git a/LibOS/shim/src/meson.build 
b/LibOS/shim/src/meson.build index 46054004f9..f94aeee176 100644 --- a/LibOS/shim/src/meson.build +++ b/LibOS/shim/src/meson.build @@ -31,6 +31,7 @@ libos_sources = files( 'fs/shim_dcache.c', 'fs/shim_fs.c', 'fs/shim_fs_hash.c', + 'fs/shim_fs_lock.c', 'fs/shim_fs_pseudo.c', 'fs/shim_namei.c', 'fs/socket/fs.c', @@ -42,6 +43,7 @@ libos_sources = files( 'fs/tmpfs/fs.c', 'ipc/shim_ipc.c', 'ipc/shim_ipc_child.c', + 'ipc/shim_ipc_fs_lock.c', 'ipc/shim_ipc_pid.c', 'ipc/shim_ipc_process_info.c', 'ipc/shim_ipc_signal.c', diff --git a/LibOS/shim/src/shim_init.c b/LibOS/shim/src/shim_init.c index a528203e95..5b17901613 100644 --- a/LibOS/shim/src/shim_init.c +++ b/LibOS/shim/src/shim_init.c @@ -17,6 +17,7 @@ #include "shim_context.h" #include "shim_defs.h" #include "shim_fs.h" +#include "shim_fs_lock.h" #include "shim_handle.h" #include "shim_internal.h" #include "shim_ipc.h" @@ -408,6 +409,7 @@ noreturn void* shim_init(int argc, void* args) { RUN_INIT(init_str_mgr); RUN_INIT(init_rlimit); RUN_INIT(init_fs); + RUN_INIT(init_fs_lock); RUN_INIT(init_dcache); RUN_INIT(init_handle); diff --git a/LibOS/shim/src/sys/shim_exit.c b/LibOS/shim/src/sys/shim_exit.c index 9a648bcbfe..28ab04c7bc 100644 --- a/LibOS/shim/src/sys/shim_exit.c +++ b/LibOS/shim/src/sys/shim_exit.c @@ -8,6 +8,7 @@ #include "pal.h" #include "pal_error.h" +#include "shim_fs_lock.h" #include "shim_handle.h" #include "shim_internal.h" #include "shim_ipc.h" @@ -94,8 +95,14 @@ noreturn void thread_exit(int error_code, int term_signal) { /* UNREACHABLE */ } + /* Clear POSIX locks before we notify parent: after a successful `wait`, our locks should be + * gone. */ + int ret = posix_lock_clear_pid(g_process.pid); + if (ret < 0) + log_warning("error clearing POSIX locks: %d\n", ret); + /* This is the last thread of the process. Let parent know we exited. 
*/ - int ret = ipc_cld_exit_send(error_code, term_signal); + ret = ipc_cld_exit_send(error_code, term_signal); if (ret < 0) { log_error("Sending IPC process-exit notification failed: %d\n", ret); } diff --git a/LibOS/shim/src/sys/shim_fcntl.c b/LibOS/shim/src/sys/shim_fcntl.c index e637c2ae78..ed0fbcf6a4 100644 --- a/LibOS/shim/src/sys/shim_fcntl.c +++ b/LibOS/shim/src/sys/shim_fcntl.c @@ -11,8 +11,10 @@ #include "pal.h" #include "pal_error.h" #include "shim_fs.h" +#include "shim_fs_lock.h" #include "shim_handle.h" #include "shim_internal.h" +#include "shim_process.h" #include "shim_lock.h" #include "shim_table.h" #include "shim_thread.h" @@ -38,10 +40,87 @@ int set_handle_nonblocking(struct shim_handle* hdl, bool on) { return ret; } +/* + * Convert user-mode `struct flock` into our `struct posix_lock`. This mostly means converting the + * position parameters (l_whence, l_start, l_len) to an absolute inclusive range [start .. end]. See + * `man fcntl` for details. + * + * We need to return -EINVAL for underflow (positions before start of file), and -EOVERFLOW for + * positive overflow. + */ +static int flock_to_posix_lock(struct flock* fl, struct shim_handle* hdl, struct posix_lock* pl) { + if (!(fl->l_type == F_RDLCK || fl->l_type == F_WRLCK || fl->l_type == F_UNLCK)) + return -EINVAL; + + int ret; + + struct shim_fs* fs = hdl->fs; + assert(fs && fs->fs_ops); + + uint64_t origin; + switch (fl->l_whence) { + case SEEK_SET: + origin = 0; + break; + case SEEK_CUR: { + if (!fs->fs_ops->seek) + return -EINVAL; + + off_t pos = fs->fs_ops->seek(hdl, 0, SEEK_CUR); + if (pos < 0) + return pos; + origin = pos; + break; + } + case SEEK_END: { + if (!fs->fs_ops->hstat) + return -EINVAL; + + struct stat stat; + ret = fs->fs_ops->hstat(hdl, &stat); + if (ret < 0) + return ret; + assert(stat.st_size >= 0); + origin = stat.st_size; + break; + } + default: + return -EINVAL; + } + + if (__builtin_add_overflow(origin, fl->l_start, &origin)) { + return fl->l_start > 0 ?
-EOVERFLOW : -EINVAL; + } + + uint64_t start, end; + if (fl->l_len > 0) { + /* l_len > 0: the range is [origin .. origin + len - 1] */ + start = origin; + if (__builtin_add_overflow(origin, fl->l_len - 1, &end)) + return -EOVERFLOW; + } else if (fl->l_len < 0) { + /* l_len < 0: the range is [origin + len .. origin - 1] */ + if (__builtin_add_overflow(origin, fl->l_len, &start)) + return -EINVAL; + if (__builtin_add_overflow(origin, -1, &end)) + return -EINVAL; + } else { + /* l_len == 0: the range is [origin .. EOF] */ + start = origin; + end = FS_LOCK_EOF; + } + + pl->type = fl->l_type; + pl->start = start; + pl->end = end; + pl->pid = g_process.pid; + return 0; +} + long shim_do_fcntl(int fd, int cmd, unsigned long arg) { struct shim_handle_map* handle_map = get_thread_handle_map(NULL); int flags; - int ret = -ENOSYS; + int ret; struct shim_handle* hdl = get_fd_handle(fd, &flags, handle_map); if (!hdl) @@ -141,21 +220,43 @@ long shim_do_fcntl(int fd, int cmd, unsigned long arg) { * l_whence, l_start, and l_len fields of lock. If a conflicting lock * is held by another process, this call returns -1 and sets errno to * EACCES or EAGAIN. - */ - case F_SETLK: - ret = -ENOSYS; - break; - - /* F_SETLKW (struct flock *) + * + * F_SETLKW (struct flock *) * As for F_SETLK, but if a conflicting lock is held on the file, * then wait for that lock to be released. If a signal is caught while * waiting, then the call is interrupted and (after the signal handler * has returned) returns immediately (with return value -1 and errno * set to EINTR; see signal(7)). */ - case F_SETLKW: - ret = -ENOSYS; + case F_SETLK: + case F_SETLKW: { + struct flock *fl = (struct flock*)arg; + if (!is_user_memory_readable(fl, sizeof(*fl))) { + ret = -EFAULT; + break; + } + + if (!hdl->dentry) { + /* TODO: Linux allows locks on pipes etc. Our locks work only for "normal" files + * that have a dentry.
*/ + ret = -EINVAL; + break; + } + + if ((fl->l_type == F_RDLCK && !(hdl->acc_mode & MAY_READ)) + || (fl->l_type == F_WRLCK && !(hdl->acc_mode & MAY_WRITE))) { + ret = -EINVAL; /* not `return`: put_handle() below must still run */ + break; + } + + struct posix_lock pl; + ret = flock_to_posix_lock(fl, hdl, &pl); + if (ret < 0) + break; + + ret = posix_lock_set(hdl->dentry, &pl, /*wait=*/cmd == F_SETLKW); break; + } /* F_GETLK (struct flock *) * On input to this call, lock describes a lock we would like to place @@ -167,9 +268,45 @@ long shim_do_fcntl(int fd, int cmd, unsigned long arg) { * l_whence, l_start, and l_len fields of lock and sets l_pid to be * the PID of the process holding that lock. */ - case F_GETLK: - ret = -ENOSYS; + case F_GETLK: { + struct flock *fl = (struct flock*)arg; + if (!is_user_memory_readable(fl, sizeof(*fl)) + || !is_user_memory_writable(fl, sizeof(*fl))) { + ret = -EFAULT; + break; + } + + if (!hdl->dentry) { + ret = -EINVAL; /* not `return`: put_handle() below must still run */ + break; + } + + struct posix_lock pl; + ret = flock_to_posix_lock(fl, hdl, &pl); + if (ret < 0) + break; + + if (pl.type == F_UNLCK) { + ret = -EINVAL; /* not `return`: put_handle() below must still run */ + break; + } + + struct posix_lock pl2; + ret = posix_lock_get(hdl->dentry, &pl, &pl2); + if (ret < 0) + break; + + fl->l_type = pl2.type; + if (pl2.type != F_UNLCK) { + fl->l_whence = SEEK_SET; + fl->l_start = pl2.start; + /* A range reaching EOF is reported as `l_len == 0`, mirroring how it is requested. */ + fl->l_len = pl2.end == FS_LOCK_EOF ? 0 : pl2.end - pl2.start + 1; + fl->l_pid = pl2.pid; + } + ret = 0; break; + } /* F_SETOWN (int) * Set the process ID or process group ID that will receive SIGIO @@ -183,6 +320,10 @@ long shim_do_fcntl(int fd, int cmd, unsigned long arg) { ret = 0; /* XXX: DUMMY for now */ break; + + default: + ret = -EINVAL; + break; } put_handle(hdl); diff --git a/LibOS/shim/test/ltp/ltp.cfg b/LibOS/shim/test/ltp/ltp.cfg index a2df151785..8b665eac9a 100644 --- a/LibOS/shim/test/ltp/ltp.cfg +++ b/LibOS/shim/test/ltp/ltp.cfg @@ -347,14 +347,6 @@ skip = yes [fcntl02_64] timeout = 40 -# no F_GETLK -[fcntl05] -skip = yes - -# no F_GETLK -[fcntl05_64] -skip = yes - # no LINUX_LOCK_FILE_REGION support (F_RGETLK/F_RSETLK) [fcntl06] skip
= yes @@ -370,125 +362,55 @@ skip = yes [fcntl07_64] skip = yes -# no F_SETLK -[fcntl09] -skip = yes - -# no F_SETLK -[fcntl09_64] -skip = yes - -# no F_SETLKW -[fcntl10] -skip = yes - -# no F_SETLKW -[fcntl10_64] -skip = yes - -# no locking -[fcntl11] -skip = yes - -# no locking -[fcntl11_64] -skip = yes - -# 1, 2, 3, 4, 5: tries 99999, F_SETLK*, F_GETLK*, gets ENOSYS instead of EINVAL/EFAULT -[fcntl13] -must-pass = - 6 - -[fcntl13_64] -timeout = 40 -must-pass = - 6 - -# no statfs() +# test for locks, slow [fcntl14] -skip = yes +timeout = 60 [fcntl14_64] -skip = yes - -# no locks -[fcntl15] -skip = yes - -[fcntl15_64] -skip = yes - -# no statfs -[fcntl16] -skip = yes +timeout = 60 -[fcntl16_64] -skip = yes - -# no locks +# no deadlock detection [fcntl17] skip = yes [fcntl17_64] skip = yes -# no locks +# test 3 uses setreuid() [fcntl18] -skip = yes +must-pass = + 1 + 2 + 4 [fcntl18_64] -skip = yes - -# no locks -[fcntl19] -skip = yes - -[fcntl19_64] -skip = yes - -# no locks -[fcntl20] -skip = yes - -[fcntl20_64] -skip = yes - -# no locks -[fcntl21] -skip = yes - -[fcntl21_64] -skip = yes - -# no locks -[fcntl22] -skip = yes - -[fcntl22_64] -skip = yes +must-pass = + 1 + 2 + 4 -# no statfs() +# no F_SETLEASE [fcntl23] skip = yes [fcntl23_64] skip = yes -# no statfs() +# no F_SETLEASE [fcntl24] skip = yes [fcntl24_64] skip = yes -# no statfs() +# no F_SETLEASE [fcntl25] skip = yes [fcntl25_64] skip = yes -# no statfs() +# no F_SETLEASE [fcntl26] skip = yes diff --git a/LibOS/shim/test/regression/.gitignore b/LibOS/shim/test/regression/.gitignore index d99fae7830..9f635828ba 100644 --- a/LibOS/shim/test/regression/.gitignore +++ b/LibOS/shim/test/regression/.gitignore @@ -31,6 +31,7 @@ /exec_victim /exit /exit_group +/fcntl_lock /fdleak /file_check_policy /file_check_policy_allow_all_but_log diff --git a/LibOS/shim/test/regression/Makefile b/LibOS/shim/test/regression/Makefile index 7a244ba8a0..077a76e878 100644 --- a/LibOS/shim/test/regression/Makefile +++ 
b/LibOS/shim/test/regression/Makefile @@ -27,6 +27,7 @@ c_executables = \ exec_victim \ exit \ exit_group \ + fcntl_lock \ fdleak \ file_check_policy \ file_size \ diff --git a/LibOS/shim/test/regression/fcntl_lock.c b/LibOS/shim/test/regression/fcntl_lock.c new file mode 100644 index 0000000000..c6f501f4fb --- /dev/null +++ b/LibOS/shim/test/regression/fcntl_lock.c @@ -0,0 +1,353 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2021 Intel Corporation + * Paweł Marczewski + */ + +/* + * Test for POSIX locks (`fcntl(F_SETLK/F_SETLKW/F_GETLK)`). We assert that the calls succeed (or + * taking a lock fails), and log all details for debugging purposes. + * + * The tests usually start another process, and coordinate with it using pipes. + */ +#define _POSIX_C_SOURCE 1 /* fileno() */ +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/wait.h> +#include <unistd.h> + +#define TEST_DIR "tmp/" +#define TEST_FILE "tmp/lock_file" + +static int g_fd; + +static const char* str_cmd(int cmd) { + switch (cmd) { + case F_SETLK: return "F_SETLK"; + case F_SETLKW: return "F_SETLKW"; + case F_GETLK: return "F_GETLK"; + default: return "???"; + } +} + +static const char* str_type(int type) { + switch (type) { + case F_RDLCK: return "F_RDLCK"; + case F_WRLCK: return "F_WRLCK"; + case F_UNLCK: return "F_UNLCK"; + default: return "???"; + } +} + +static const char* str_whence(int whence) { + switch (whence) { + case SEEK_SET: return "SEEK_SET"; + case SEEK_CUR: return "SEEK_CUR"; + case SEEK_END: return "SEEK_END"; + default: return "???"; + } +} + +static const char* str_err(int err) { + switch (err) { + case EACCES: return "EACCES"; + case EAGAIN: return "EAGAIN"; + default: return "???"; + } +} + +/* Run fcntl command and log it, along with the result. Returns true if it succeeds (F_SETLK returns + * success, F_GETLK returns no conflicting lock).
*/ +static bool try_lock(int cmd, int type, int whence, long int start, long int len) { + assert(cmd == F_SETLK || cmd == F_SETLKW || cmd == F_GETLK); + assert(type == F_RDLCK || type == F_WRLCK || type == F_UNLCK); + assert(whence == SEEK_SET || whence == SEEK_CUR || whence == SEEK_END); + + struct flock fl = { + .l_type = type, + .l_whence = whence, + .l_start = start, + .l_len = len, + }; + int ret = fcntl(g_fd, cmd, &fl); + if (ret == -1 && errno != EACCES && errno != EAGAIN) + err(1, "fcntl"); + + fprintf(stderr, "%d: fcntl(fd, %s, {%s, %s, %4ld, %4ld}) = %s", getpid(), str_cmd(cmd), + str_type(type), str_whence(whence), start, len, ret == 0 ? "0" : str_err(errno)); + if (ret == 0 && cmd == F_GETLK) { + if (fl.l_type == F_UNLCK) { + fprintf(stderr, "; {%s}\n", str_type(fl.l_type)); + } else { + fprintf(stderr, "; {%s, %s, %4ld, %4ld, %d}\n", str_type(fl.l_type), + str_whence(fl.l_whence), fl.l_start, fl.l_len, fl.l_pid); + } + } else { + fprintf(stderr, "\n"); + } + + fflush(stderr); + + if (cmd == F_GETLK) { + return fl.l_type == F_UNLCK; + } else { + return ret == 0; + } +} + +static void unlock(long int start, long int len) { + if (!try_lock(F_SETLK, F_UNLCK, SEEK_SET, start, len)) + errx(1, "unlock failed"); +} + +static void lock_ok(int type, long int start, long int len) { + assert(type == F_RDLCK || type == F_WRLCK); + + if (!try_lock(F_GETLK, type, SEEK_SET, start, len) + || !try_lock(F_SETLK, type, SEEK_SET, start, len)) + errx(1, "setting %s failed", str_type(type)); +} + +static void lock_wait_ok(int type, long int start, long int len) { + if (!try_lock(F_SETLKW, type, SEEK_SET, start, len)) + errx(1, "waiting for %s failed", str_type(type)); +} + +static void lock_fail(int type, long int start, long int len) { + if (try_lock(F_GETLK, type, SEEK_SET, start, len) + || try_lock(F_SETLK, type, SEEK_SET, start, len)) + errx(1, "setting %s succeeded unexpectedly", str_type(type)); +} + +/* + * Test: lock/unlock various ranges.
The locks are all for the same process, so the test is unlikely + * to fail, but it's useful for checking if the locks are replaced and merged correctly (by looking + * at Graphene debug output). + */ +static void test_ranges(void) { + printf("test ranges...\n"); + unlock(0, 0); + + /* Lock some ranges, check joining adjacent ranges */ + lock_ok(F_RDLCK, 10, 10); + lock_ok(F_RDLCK, 30, 10); + lock_ok(F_RDLCK, 20, 10); + lock_ok(F_RDLCK, 1000, 0); + + /* Unlock some ranges, check subtracting and splitting ranges */ + unlock(5, 10); + unlock(20, 5); + unlock(35, 10); + unlock(950, 100); + + /* Overwrite with write lock */ + lock_ok(F_WRLCK, 0, 30); + lock_ok(F_WRLCK, 30, 30); +} + +static void wait_for_child(void) { + int ret; + do { + ret = wait(NULL); + } while (ret == -1 && errno == EINTR); + if (ret == -1) + err(1, "wait"); +} + +static void open_pipes(int pipes[2][2]) { + for (unsigned int i = 0; i < 2; i++) { + if (pipe(pipes[i]) < 0) + err(1, "pipe"); + } +} + +static void close_pipes(int pipes[2][2]) { + for (unsigned int i = 0; i < 2; i++) { + for (unsigned int j = 0; j < 2; j++) { + if (close(pipes[i][j]) < 0) + err(1, "close pipe"); + } + } +} + +static void write_pipe(int pipe[2]) { + char c = 0; + int ret; + do { + ret = write(pipe[1], &c, sizeof(c)); + } while (ret == -1 && errno == EINTR); + if (ret == -1) + err(1, "write"); +} + +static void read_pipe(int pipe[2]) { + char c; + int ret; + do { + ret = read(pipe[0], &c, sizeof(c)); + } while (ret == -1 && errno == EINTR); + if (ret == -1) + err(1, "read"); + if (ret == 0) + errx(1, "pipe closed"); +} + +/* Test: child takes a lock and then exits. The lock should be released.
*/ +static void test_child_exit(void) { + printf("test child exit...\n"); + unlock(0, 0); + + int pipes[2][2]; + open_pipes(pipes); + + pid_t pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + lock_ok(F_WRLCK, 0, 100); + write_pipe(pipes[0]); + read_pipe(pipes[1]); + exit(0); + } + + read_pipe(pipes[0]); + lock_fail(F_RDLCK, 0, 100); + write_pipe(pipes[1]); + lock_wait_ok(F_RDLCK, 0, 100); + + wait_for_child(); + close_pipes(pipes); +} + +/* Test: child takes a lock, and then closes a duplicated FD. The lock should be released. */ +static void test_file_close(void) { + printf("test file close...\n"); + unlock(0, 0); + + int pipes[2][2]; + open_pipes(pipes); + + pid_t pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + lock_ok(F_WRLCK, 0, 100); + write_pipe(pipes[0]); + read_pipe(pipes[1]); + + int fd2 = dup(g_fd); + if (fd2 < 0) + err(1, "dup"); + + if (close(fd2) < 0) + err(1, "close"); + + read_pipe(pipes[1]); + exit(0); + } + + read_pipe(pipes[0]); + lock_fail(F_RDLCK, 0, 100); + write_pipe(pipes[1]); + lock_wait_ok(F_RDLCK, 0, 100); + write_pipe(pipes[1]); + + wait_for_child(); + close_pipes(pipes); +} + +/* Test: child waits for parent to release a lock. */ +static void test_child_wait(void) { + printf("test child wait...\n"); + unlock(0, 0); + + int pipes[2][2]; + open_pipes(pipes); + + lock_ok(F_RDLCK, 0, 100); + + pid_t pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + lock_ok(F_RDLCK, 0, 100); + lock_fail(F_WRLCK, 0, 100); + write_pipe(pipes[0]); + lock_wait_ok(F_WRLCK, 0, 100); + exit(0); + } + + read_pipe(pipes[0]); + unlock(0, 100); + + wait_for_child(); + close_pipes(pipes); +} + +/* Test: parent waits for child to release a lock.
*/ +static void test_parent_wait(void) { + printf("test parent wait...\n"); + unlock(0, 0); + + int pipes[2][2]; + open_pipes(pipes); + + pid_t pid = fork(); + if (pid < 0) + err(1, "fork"); + + if (pid == 0) { + lock_ok(F_RDLCK, 0, 100); + write_pipe(pipes[0]); + read_pipe(pipes[1]); + unlock(0, 100); + read_pipe(pipes[1]); + exit(0); + } + + /* parent process: */ + + read_pipe(pipes[0]); + + /* read lock should succeed */ + lock_ok(F_RDLCK, 0, 100); + lock_fail(F_WRLCK, 0, 100); + write_pipe(pipes[1]); + lock_wait_ok(F_WRLCK, 0, 100); + write_pipe(pipes[1]); + + wait_for_child(); + close_pipes(pipes); +} + +int main(void) { + setbuf(stdout, NULL); + + FILE* fp = fopen(TEST_FILE, "w+"); + if (!fp) + err(1, "fopen"); + + g_fd = fileno(fp); + + test_ranges(); + test_child_exit(); + test_file_close(); + test_child_wait(); + test_parent_wait(); + + if (fclose(fp) == EOF) + err(1, "fclose"); + + if (unlink(TEST_FILE) < 0) + err(1, "unlink"); + + printf("TEST OK\n"); + return 0; +} diff --git a/LibOS/shim/test/regression/test_libos.py b/LibOS/shim/test/regression/test_libos.py index b317ddfbeb..08b2b0e189 100644 --- a/LibOS/shim/test/regression/test_libos.py +++ b/LibOS/shim/test/regression/test_libos.py @@ -582,6 +582,10 @@ def test_103_gettimeofday(self): stdout, _ = self.run_binary(['gettimeofday']) self.assertIn('TEST OK', stdout) + def test_110_fcntl_lock(self): + stdout, _ = self.run_binary(['fcntl_lock']) + self.assertIn('TEST OK', stdout) + class TC_31_Syscall(RegressionTestCase): @unittest.skipUnless(HAS_SGX, 'This test is only meaningful on SGX PAL because only SGX catches raw '