Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/fluent-bit/flb_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,8 @@ struct flb_config {
int shutdown_by_hot_reloading;
int hot_reloading;
int hot_reload_succeeded;

int hot_reload_watchdog_timeout_seconds;

/* Routing */
size_t route_mask_size;
Expand Down Expand Up @@ -381,6 +383,7 @@ enum conf_type {

#define FLB_CONF_STR_HOT_RELOAD "Hot_Reload"
#define FLB_CONF_STR_HOT_RELOAD_ENSURE_THREAD_SAFETY "Hot_Reload.Ensure_Thread_Safety"
#define FLB_CONF_STR_HOT_RELOAD_TIMEOUT "Hot_Reload.Timeout"

/* Set up maxstdio (Windows) */
#define FLB_CONF_STR_WINDOWS_MAX_STDIO "windows.maxstdio"
Expand Down
14 changes: 14 additions & 0 deletions plugins/in_dummy/in_dummy.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <fluent-bit/flb_time.h>
#include <fluent-bit/flb_pack.h>
#include <fluent-bit/flb_log_event.h>
#include <fluent-bit/flb_utils.h>

#include "in_dummy.h"

Expand Down Expand Up @@ -332,6 +333,7 @@ static int in_dummy_init(struct flb_input_instance *in,
ctx->ins = in;
ctx->samples = 0;
ctx->samples_count = 0;
ctx->test_hang_on_exit = FLB_FALSE;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

happy to wrap all this in #ifdef FLB_TESTS_INTERNAL, just want to make sure that's what we want first


/* Initialize head config */
ret = configure(ctx, in, &tm);
Expand Down Expand Up @@ -391,6 +393,13 @@ static int in_dummy_exit(void *data, struct flb_config *config)
(void) *config;
struct flb_dummy *ctx = data;

/* Test-only hang used by watchdog tests */
if (ctx->test_hang_on_exit) {
flb_plg_debug(ctx->ins, "TEST: Simulating hang for hot reload watchdog test");
/* 1000 seconds */
flb_time_msleep(1000 * 1000);
}

config_destroy(ctx);

return 0;
Expand Down Expand Up @@ -453,6 +462,11 @@ static struct flb_config_map config_map[] = {
0, FLB_TRUE, offsetof(struct flb_dummy, flush_on_startup),
"generate the first event on startup"
},
{
FLB_CONFIG_MAP_BOOL, "test_hang_on_exit", "false",
0, FLB_TRUE, offsetof(struct flb_dummy, test_hang_on_exit),
"TEST ONLY: simulate hang during exit to test hot reload watchdog"
},
{0}
};

Expand Down
1 change: 1 addition & 0 deletions plugins/in_dummy/in_dummy.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ struct flb_dummy {

int fixed_timestamp;
int flush_on_startup;
int test_hang_on_exit; /* TEST ONLY: Used for hot reload watchdog testing */

char *ref_metadata_msgpack;
size_t ref_metadata_msgpack_size;
Expand Down
5 changes: 5 additions & 0 deletions src/flb_config.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ struct flb_service_config service_configs[] = {
FLB_CONF_TYPE_BOOL,
offsetof(struct flb_config, ensure_thread_safety_on_hot_reloading)},

{FLB_CONF_STR_HOT_RELOAD_TIMEOUT,
FLB_CONF_TYPE_INT,
offsetof(struct flb_config, hot_reload_watchdog_timeout_seconds)},

{NULL, FLB_CONF_TYPE_OTHER, 0} /* end of array */
};

Expand Down Expand Up @@ -308,6 +312,7 @@ struct flb_config *flb_config_init()
config->shutdown_by_hot_reloading = FLB_FALSE;
config->hot_reloading = FLB_FALSE;
config->hot_reload_succeeded = FLB_FALSE;
config->hot_reload_watchdog_timeout_seconds = 0;

#ifdef FLB_SYSTEM_WINDOWS
config->win_maxstdio = 512;
Expand Down
84 changes: 78 additions & 6 deletions src/flb_reload.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,16 @@
#include <fluent-bit/flb_utils.h>
#include <fluent-bit/flb_plugin.h>
#include <fluent-bit/flb_reload.h>
#include <fluent-bit/flb_time.h>

#include <cfl/cfl.h>
#include <cfl/cfl_sds.h>
#include <cfl/cfl_variant.h>
#include <cfl/cfl_kvlist.h>

#include <fluent-bit/flb_pthread.h>
#include <stdlib.h>

static int flb_input_propery_check_all(struct flb_config *config)
{
int ret;
Expand Down Expand Up @@ -376,6 +380,66 @@ static int flb_reload_reinstantiate_external_plugins(struct flb_config *src, str
return 0;
}

struct flb_reload_watchdog_ctx {
pthread_t tid;
int timeout_seconds;
};

static void *hot_reload_watchdog_thread(void *arg)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function should only called if the reload watchdog timer is enabled manually, the user must be aware about an explicit abort() in case Fluent Bit is being used in library mode and can generate other side effects

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense. I'm structured the code so that we return early without starting the thread if the feature is disabled and disabled it by default.

{
struct flb_reload_watchdog_ctx *ctx = (struct flb_reload_watchdog_ctx *)arg;

/* Set async cancellation type for (mostly) immediate response to pthread_cancel */
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

flb_time_msleep(ctx->timeout_seconds * 1000);

flb_error("[hot_reload_watchdog] Hot reload timeout exceeded (%d seconds), "
"aborting to prevent indefinite hang", ctx->timeout_seconds);
abort();
}

static struct flb_reload_watchdog_ctx *flb_reload_watchdog_start(struct flb_config *config)
{
struct flb_reload_watchdog_ctx *watchdog_ctx;
int ret;

if (config->hot_reload_watchdog_timeout_seconds <= 0) {
flb_debug("[reload] Hot reload watchdog disabled");
return NULL;
}

watchdog_ctx = flb_malloc(sizeof(struct flb_reload_watchdog_ctx));
if (!watchdog_ctx) {
flb_errno();
return NULL;
}
watchdog_ctx->timeout_seconds = config->hot_reload_watchdog_timeout_seconds;

ret = pthread_create(&watchdog_ctx->tid, NULL, hot_reload_watchdog_thread, watchdog_ctx);
if (ret != 0) {
flb_error("[reload] Failed to create hot reload watchdog thread: %d", ret);
flb_free(watchdog_ctx);
return NULL;
}

flb_debug("[reload] Hot reload watchdog thread started");
return watchdog_ctx;
}

static void flb_reload_watchdog_cleanup(struct flb_reload_watchdog_ctx *watchdog_ctx)
{
if (!watchdog_ctx) {
return;
}

pthread_cancel(watchdog_ctx->tid);
pthread_join(watchdog_ctx->tid, NULL);
Comment on lines +436 to +437
Copy link
Contributor Author

@stoksc stoksc Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i didn't think any more sophisticated synchronization (around the abort call) was worth the complexity cost but open to opinions (or knowledge i dont have, not great at c)

flb_debug("[reload] Hot reload watchdog thread cancelled");

flb_free(watchdog_ctx);
}

int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
{
int ret;
Expand All @@ -387,6 +451,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
struct flb_cf *original_cf;
int verbose;
int reloaded_count = 0;
struct flb_reload_watchdog_ctx *watchdog_ctx = NULL;

if (ctx == NULL) {
flb_error("[reload] given flb context is NULL");
Expand Down Expand Up @@ -417,6 +482,9 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
(long unsigned int) getpid(),
(void *) pthread_self());

/* Start the watchdog thread */
watchdog_ctx = flb_reload_watchdog_start(old_config);

if (old_config->conf_path_file) {
file = flb_sds_create(old_config->conf_path_file);
}
Expand All @@ -427,6 +495,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
}
flb_cf_destroy(new_cf);
flb_error("[reload] reconstruct cf failed");
flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_HALTED;
}
}
Expand All @@ -439,7 +508,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
}
flb_cf_destroy(new_cf);
flb_error("[reload] creating flb context is failed. Reloading is halted");

flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_HALTED;
}

Expand Down Expand Up @@ -469,7 +538,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
if (!new_cf) {
flb_sds_destroy(file);
old_config->hot_reloading = FLB_FALSE;

flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_HALTED;
}
}
Expand All @@ -485,7 +554,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
flb_destroy(new_ctx);
old_config->hot_reloading = FLB_FALSE;
flb_error("[reload] reloaded config is invalid. Reloading is halted");

flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_HALTED;
}
}
Expand All @@ -499,7 +568,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
old_config->hot_reloading = FLB_FALSE;

flb_error("[reload] reloaded config format is invalid. Reloading is halted");

flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_HALTED;
}

Expand All @@ -512,7 +581,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
old_config->hot_reloading = FLB_FALSE;

flb_error("[reload] reloaded config is invalid. Reloading is halted");

flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_HALTED;
}

Expand Down Expand Up @@ -541,7 +610,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
old_config->hot_reloading = FLB_FALSE;

flb_error("[reload] loaded configuration contains error(s). Reloading is aborted");

flb_reload_watchdog_cleanup(watchdog_ctx);
return FLB_RELOAD_ABORTED;
}

Expand All @@ -550,6 +619,9 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts)
flb_debug("[reload] hot reloaded %d time(s)", reloaded_count);
new_config->hot_reloading = FLB_FALSE;
new_config->hot_reload_succeeded = FLB_TRUE;

/* Cancel the watchdog thread since reload completed successfully */
flb_reload_watchdog_cleanup(watchdog_ctx);

return 0;
}
Loading
Loading