Skip to content

Commit a01c682

Browse files
PSRCodejgalar
authored andcommitted
Fix: run-as thread deadlock on itself in restart error path
The deadlock was found using this backtrace Thread 5: 0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 1 0x00007efc6b650023 in __GI___pthread_mutex_lock (mutex=mutex@entry=0x55fc37128400 <worker_lock>) at ../nptl/pthread_mutex_lock.c:78 2 0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233 3 0x000055fc36efc2e7 in run_as_restart_worker (worker=<optimized out>) at runas.c:998 4 run_as (cmd=cmd@entry=RUN_AS_UNLINK, data=data@entry=0x7efc5b7fa630, ret_value=ret_value@entry=0x7efc5b7fa510, uid=uid@entry=1000, gid=gid@entry=1000) at runas.c:1033 5 0x000055fc36efc9ce in run_as_unlink (path=path@entry=0x7efc5b7fb690 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx", uid=uid@entry=1000, gid=gid@entry=1000) at runas.c :1120 6 0x000055fc36ef7feb in utils_unlink_stream_file (path_name=path_name@entry=0x7efc5b7fc7e0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index", file_name=file_name@entry=0x7efc500085d4 "channel0_3", size=size@entry=0, count=count@entry=0, uid=uid@entry=1000, gid=gid@entry=1000, suffix=0x55fc36f19b26 ".idx") at utils.c:929 7 0x000055fc36f01d4e in lttng_index_file_create (path_name=path_name@entry=0x7efc500087a0 "/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", stream_name=stream_name@entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, size=0, count=0, major=1, minor=1) at index.c:79 8 0x000055fc36ed9475 in rotate_local_stream (ctx=<optimized out>, stream=0x7efc50008460) at consumer.c:4105 9 0x000055fc36ed98b5 in lttng_consumer_rotate_stream (ctx=ctx@entry=0x55fc37428d80, stream=stream@entry=0x7efc50008460, rotated=rotated@entry=0x7efc5b7fdb27) at consumer.c:4181 10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer (stream=stream@entry=0x7efc50008460, ctx=ctx@entry=0x55fc37428d80, rotated=rotated@entry=0x7efc5b7fdb27) at kernel-consumer.c:1740 11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, ctx=0x55fc37428d80) at consumer.c:3383 12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at consumer.c:2751 13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at pthread_create.c:463 14 0x00007efc6af6488f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 The owner of the lock is itself: print worker_lock.__data.__owner $2 = 25725 thread find 25725 Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)' The worker_lock is first taken in frame #4: run_as runas.c:1033 pthread_mutex_lock(&worker_lock); if (use_clone()) { ... /* * If the worker thread crashed the errno is set to EIO. we log * the error and start a new worker process. */ if (ret == -1 && saved_errno == EIO) { DBG("Socket closed unexpectedly... " "Restarting the worker process"); -> ret = run_as_restart_worker(global_worker); if (ret == -1) { ERR("Failed to restart worker process."); goto err; } Solution ======== Create run_as_restart_worker_no_lock which does not to take the lock on execution. Use run_as_restart_worker_no_lock at the run_as error path call site. Use run_as_restart_worker_no_lock inside run_as_restart_worker while holding the worker lock to provide identical behaviour to other call sites. Signed-off-by: Jonathan Rajotte <[email protected]> Signed-off-by: Jérémie Galarneau <[email protected]>
1 parent 5b8139c commit a01c682

File tree

1 file changed

+47
-42
lines changed

1 file changed

+47
-42
lines changed

src/common/runas.c

+47-42
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,51 @@ int run_as_create_worker_no_lock(const char *procname,
994994
return ret;
995995
}
996996

997+
static
998+
void run_as_destroy_worker_no_lock(void)
999+
{
1000+
struct run_as_worker *worker = global_worker;
1001+
1002+
DBG("Destroying run_as worker");
1003+
if (!worker) {
1004+
return;
1005+
}
1006+
/* Close unix socket */
1007+
DBG("Closing run_as worker socket");
1008+
if (lttcomm_close_unix_sock(worker->sockpair[0])) {
1009+
PERROR("close");
1010+
}
1011+
worker->sockpair[0] = -1;
1012+
/* Wait for worker. */
1013+
for (;;) {
1014+
int status;
1015+
pid_t wait_ret;
1016+
1017+
wait_ret = waitpid(worker->pid, &status, 0);
1018+
if (wait_ret < 0) {
1019+
if (errno == EINTR) {
1020+
continue;
1021+
}
1022+
PERROR("waitpid");
1023+
break;
1024+
}
1025+
1026+
if (WIFEXITED(status)) {
1027+
LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
1028+
DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
1029+
WEXITSTATUS(status));
1030+
break;
1031+
} else if (WIFSIGNALED(status)) {
1032+
ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
1033+
WTERMSIG(status));
1034+
break;
1035+
}
1036+
}
1037+
free(worker->procname);
1038+
free(worker);
1039+
global_worker = NULL;
1040+
}
1041+
9971042
static
9981043
int run_as_restart_worker(struct run_as_worker *worker)
9991044
{
@@ -1003,7 +1048,7 @@ int run_as_restart_worker(struct run_as_worker *worker)
10031048
procname = worker->procname;
10041049

10051050
/* Close socket to run_as worker process and clean up the zombie process */
1006-
run_as_destroy_worker();
1051+
run_as_destroy_worker_no_lock();
10071052

10081053
/* Create a new run_as worker process*/
10091054
ret = run_as_create_worker_no_lock(procname, NULL, NULL);
@@ -1238,47 +1283,7 @@ int run_as_create_worker(const char *procname,
12381283
LTTNG_HIDDEN
12391284
void run_as_destroy_worker(void)
12401285
{
1241-
struct run_as_worker *worker = global_worker;
1242-
1243-
DBG("Destroying run_as worker");
12441286
pthread_mutex_lock(&worker_lock);
1245-
if (!worker) {
1246-
goto end;
1247-
}
1248-
/* Close unix socket */
1249-
DBG("Closing run_as worker socket");
1250-
if (lttcomm_close_unix_sock(worker->sockpair[0])) {
1251-
PERROR("close");
1252-
}
1253-
worker->sockpair[0] = -1;
1254-
/* Wait for worker. */
1255-
for (;;) {
1256-
int status;
1257-
pid_t wait_ret;
1258-
1259-
wait_ret = waitpid(worker->pid, &status, 0);
1260-
if (wait_ret < 0) {
1261-
if (errno == EINTR) {
1262-
continue;
1263-
}
1264-
PERROR("waitpid");
1265-
break;
1266-
}
1267-
1268-
if (WIFEXITED(status)) {
1269-
LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
1270-
DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
1271-
WEXITSTATUS(status));
1272-
break;
1273-
} else if (WIFSIGNALED(status)) {
1274-
ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal %d",
1275-
WTERMSIG(status));
1276-
break;
1277-
}
1278-
}
1279-
free(worker->procname);
1280-
free(worker);
1281-
global_worker = NULL;
1282-
end:
1287+
run_as_destroy_worker_no_lock();
12831288
pthread_mutex_unlock(&worker_lock);
12841289
}

0 commit comments

Comments
 (0)