Skip to content

Commit a4beac7

Browse files
PSRCodejgalar
authored andcommitted
Fix: consumer: snapshot: assertion on subsequent snapshot
Observed issue ============== While a snapshot is being taken, the containing folder can disappear unexpectedly. This can lead to the following errors, which are expected and mostly handled fine: PERROR - 14:47:32.002564464 [2922498/2922507]: Failed to open file relative to trace chunk file_path = "channel0_0", flags = 577, mode = 432: No such file or directory (in _lttng_trace_chunk_open_fs_handle_locked() at trace-chunk.cpp:1411) Error: Failed to open stream file "channel0_0" Error: Snapshot channel failed The problem happens on the subsequent snapshot for the session: #0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50 #1 0x00007fbbdadb3859 in __GI_abort () at abort.c:79 #2 0x00007fbbdadb3729 in __assert_fail_base (fmt=0x7fbbdaf49588 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55c4212cfbb5 "!stream->trace_chunk", file=0x55c4212cf820 "kernel-co #3 0x00007fbbdadc5006 in __GI___assert_fail (assertion=0x55c4212cfbb5 "!stream->trace_chunk", file=0x55c4212cf820 "kernel-consumer/kernel-consumer.cpp", line=188, function=0x55c4212cfb00 " #4 0x000055c421268cc6 in lttng_kconsumer_snapshot_channel (channel=0x7fbbc4000b60, key=1, path=0x7fbbd37f8fd4 "", relayd_id=18446744073709551615, nb_packets_per_stream=0) at kernel-consume #5 0x000055c42126b39d in lttng_kconsumer_recv_cmd (ctx=0x55c421b80a90, sock=31, consumer_sockpoll=0x7fbbd37fd280) at kernel-consumer/kernel-consumer.cpp:986 #6 0x000055c4212546d1 in lttng_consumer_recv_cmd (ctx=0x55c421b80a90, sock=31, consumer_sockpoll=0x7fbbd37fd280) at consumer/consumer.cpp:2090 #7 0x000055c421259963 in consumer_thread_sessiond_poll (data=0x55c421b80a90) at consumer/consumer.cpp:3281 #8 0x00007fbbdaf8b609 in start_thread (arg=<optimized out>) at pthread_create.c:477 lttng#9 0x00007fbbdaeb0163 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 How to reproduce: 1. Setting a breakpoint on snapshot_channel() inside src/common/ust-consumer/ust-consumer.cpp 2. When the breakpoint hits, remove the the complete lttng directory containing the session data. 3. Continue the lttng_consumerd process from gdb. 4. In that case you see a negative return value -1 from consumer_stream_create_output_files() inside snapshot_channel(). 5. Take another snapshot and lttng_consumerd crashes because of the `assert(!stream->trace_chunk)` in snapshot_channel(). This last action does not require any breakpoint intervention. Cause ===== During the snapshot, the stream is assigned the channel current chunk. It is expected that the stream does not have a chunk at this point. The error handling is faulty here, the stream chunk must be invalidated/reset on error to allow its reuse later on. The problem exists for both consumer domains (user/kernel). Solution ======== For the ust consumer, we can directly use the `error_close_stream` label. For the kernel consumer, the code path is slightly different since it does not uses `consumer_stream_close`. Note that `consumer_stream_close` cannot be used as is for the kernel consumer. The current implementation partially resembles `consumer_stream_close` at the end of the iteration. It is extracted to its own function for easier reuse from the new `error_finalize_stream` label. Known drawbacks ========= None. Fixes: #1352 Signed-off-by: Marcel Hamer <[email protected]> Signed-off-by: Jonathan Rajotte <[email protected]> Signed-off-by: Jérémie Galarneau <[email protected]> Change-Id: I9fc81917b19aa436ed8e8679672648f2d5baf41a
1 parent 6e5438d commit a4beac7

File tree

2 files changed

+40
-28
lines changed

2 files changed

+40
-28
lines changed

src/common/kernel-consumer/kernel-consumer.cpp

+35-23
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,30 @@ int get_current_subbuf_addr(struct lttng_consumer_stream *stream,
135135
return ret;
136136
}
137137

138+
static void finalize_snapshot_stream(
139+
struct lttng_consumer_stream *stream, uint64_t relayd_id)
140+
{
141+
ASSERT_LOCKED(stream->lock);
142+
143+
if (relayd_id == (uint64_t) -1ULL) {
144+
if (stream->out_fd >= 0) {
145+
const int ret = close(stream->out_fd);
146+
147+
if (ret < 0) {
148+
PERROR("Failed to close stream snapshot output file descriptor");
149+
}
150+
151+
stream->out_fd = -1;
152+
}
153+
} else {
154+
close_relayd_stream(stream);
155+
stream->net_seq_idx = (uint64_t) -1ULL;
156+
}
157+
158+
lttng_trace_chunk_put(stream->trace_chunk);
159+
stream->trace_chunk = NULL;
160+
}
161+
138162
/*
139163
* Take a snapshot of all the stream of a channel
140164
* RCU read-side lock must be held across this function to ensure existence of
@@ -198,13 +222,13 @@ static int lttng_kconsumer_snapshot_channel(
198222
ret = consumer_send_relayd_stream(stream, path);
199223
if (ret < 0) {
200224
ERR("sending stream to relayd");
201-
goto end_unlock;
225+
goto error_finalize_stream;
202226
}
203227
} else {
204228
ret = consumer_stream_create_output_files(stream,
205229
false);
206230
if (ret < 0) {
207-
goto end_unlock;
231+
goto error_finalize_stream;
208232
}
209233
DBG("Kernel consumer snapshot stream (%" PRIu64 ")",
210234
stream->key);
@@ -222,27 +246,27 @@ static int lttng_kconsumer_snapshot_channel(
222246
ret = kernctl_buffer_flush(stream->wait_fd);
223247
if (ret < 0) {
224248
ERR("Failed to flush kernel stream");
225-
goto end_unlock;
249+
goto error_finalize_stream;
226250
}
227251
goto end_unlock;
228252
}
229253

230254
ret = lttng_kconsumer_take_snapshot(stream);
231255
if (ret < 0) {
232256
ERR("Taking kernel snapshot");
233-
goto end_unlock;
257+
goto error_finalize_stream;
234258
}
235259

236260
ret = lttng_kconsumer_get_produced_snapshot(stream, &produced_pos);
237261
if (ret < 0) {
238262
ERR("Produced kernel snapshot position");
239-
goto end_unlock;
263+
goto error_finalize_stream;
240264
}
241265

242266
ret = lttng_kconsumer_get_consumed_snapshot(stream, &consumed_pos);
243267
if (ret < 0) {
244268
ERR("Consumerd kernel snapshot position");
245-
goto end_unlock;
269+
goto error_finalize_stream;
246270
}
247271

248272
consumed_pos = consumer_get_consume_start_pos(consumed_pos,
@@ -262,7 +286,7 @@ static int lttng_kconsumer_snapshot_channel(
262286
if (ret < 0) {
263287
if (ret != -EAGAIN) {
264288
PERROR("kernctl_get_subbuf snapshot");
265-
goto end_unlock;
289+
goto error_finalize_stream;
266290
}
267291
DBG("Kernel consumer get subbuf failed. Skipping it.");
268292
consumed_pos += stream->max_sb_size;
@@ -312,26 +336,12 @@ static int lttng_kconsumer_snapshot_channel(
312336
ret = kernctl_put_subbuf(stream->wait_fd);
313337
if (ret < 0) {
314338
ERR("Snapshot kernctl_put_subbuf");
315-
goto end_unlock;
339+
goto error_finalize_stream;
316340
}
317341
consumed_pos += stream->max_sb_size;
318342
}
319343

320-
if (relayd_id == (uint64_t) -1ULL) {
321-
if (stream->out_fd >= 0) {
322-
ret = close(stream->out_fd);
323-
if (ret < 0) {
324-
PERROR("Kernel consumer snapshot close out_fd");
325-
goto end_unlock;
326-
}
327-
stream->out_fd = -1;
328-
}
329-
} else {
330-
close_relayd_stream(stream);
331-
stream->net_seq_idx = (uint64_t) -1ULL;
332-
}
333-
lttng_trace_chunk_put(stream->trace_chunk);
334-
stream->trace_chunk = NULL;
344+
finalize_snapshot_stream(stream, relayd_id);
335345
pthread_mutex_unlock(&stream->lock);
336346
}
337347

@@ -344,6 +354,8 @@ static int lttng_kconsumer_snapshot_channel(
344354
if (ret < 0) {
345355
ERR("Snapshot kernctl_put_subbuf error path");
346356
}
357+
error_finalize_stream:
358+
finalize_snapshot_stream(stream, relayd_id);
347359
end_unlock:
348360
pthread_mutex_unlock(&stream->lock);
349361
end:

src/common/ust-consumer/ust-consumer.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -1089,13 +1089,13 @@ static int snapshot_channel(struct lttng_consumer_channel *channel,
10891089
if (use_relayd) {
10901090
ret = consumer_send_relayd_stream(stream, path);
10911091
if (ret < 0) {
1092-
goto error_unlock;
1092+
goto error_close_stream;
10931093
}
10941094
} else {
10951095
ret = consumer_stream_create_output_files(stream,
10961096
false);
10971097
if (ret < 0) {
1098-
goto error_unlock;
1098+
goto error_close_stream;
10991099
}
11001100
DBG("UST consumer snapshot stream (%" PRIu64 ")",
11011101
stream->key);
@@ -1117,19 +1117,19 @@ static int snapshot_channel(struct lttng_consumer_channel *channel,
11171117
ret = lttng_ustconsumer_take_snapshot(stream);
11181118
if (ret < 0) {
11191119
ERR("Taking UST snapshot");
1120-
goto error_unlock;
1120+
goto error_close_stream;
11211121
}
11221122

11231123
ret = lttng_ustconsumer_get_produced_snapshot(stream, &produced_pos);
11241124
if (ret < 0) {
11251125
ERR("Produced UST snapshot position");
1126-
goto error_unlock;
1126+
goto error_close_stream;
11271127
}
11281128

11291129
ret = lttng_ustconsumer_get_consumed_snapshot(stream, &consumed_pos);
11301130
if (ret < 0) {
11311131
ERR("Consumerd UST snapshot position");
1132-
goto error_unlock;
1132+
goto error_close_stream;
11331133
}
11341134

11351135
/*

0 commit comments

Comments
 (0)