-
Notifications
You must be signed in to change notification settings - Fork 1.9k
reload: add a timeout mechanism #10869
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
efdacb9
c8a3263
70f9acf
80b1579
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,12 +32,16 @@ | |
| #include <fluent-bit/flb_utils.h> | ||
| #include <fluent-bit/flb_plugin.h> | ||
| #include <fluent-bit/flb_reload.h> | ||
| #include <fluent-bit/flb_time.h> | ||
|
|
||
| #include <cfl/cfl.h> | ||
| #include <cfl/cfl_sds.h> | ||
| #include <cfl/cfl_variant.h> | ||
| #include <cfl/cfl_kvlist.h> | ||
|
|
||
| #include <fluent-bit/flb_pthread.h> | ||
| #include <stdlib.h> | ||
|
|
||
| static int flb_input_propery_check_all(struct flb_config *config) | ||
| { | ||
| int ret; | ||
|
|
@@ -376,6 +380,66 @@ static int flb_reload_reinstantiate_external_plugins(struct flb_config *src, str | |
| return 0; | ||
| } | ||
|
|
||
| struct flb_reload_watchdog_ctx { | ||
| pthread_t tid; | ||
| int timeout_seconds; | ||
| }; | ||
|
|
||
| static void *hot_reload_watchdog_thread(void *arg) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this function should only called if the reload watchdog timer is enabled manually, the user must be aware about an explicit abort() in case Fluent Bit is being used in library mode and can generate other side effects
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. makes sense. I'm structured the code so that we return early without starting the thread if the feature is disabled and disabled it by default. |
||
| { | ||
| struct flb_reload_watchdog_ctx *ctx = (struct flb_reload_watchdog_ctx *)arg; | ||
|
|
||
| /* Set async cancellation type for (mostly) immediate response to pthread_cancel */ | ||
| pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); | ||
|
|
||
| flb_time_msleep(ctx->timeout_seconds * 1000); | ||
|
|
||
| flb_error("[hot_reload_watchdog] Hot reload timeout exceeded (%d seconds), " | ||
| "aborting to prevent indefinite hang", ctx->timeout_seconds); | ||
| abort(); | ||
| } | ||
|
|
||
| static struct flb_reload_watchdog_ctx *flb_reload_watchdog_start(struct flb_config *config) | ||
| { | ||
| struct flb_reload_watchdog_ctx *watchdog_ctx; | ||
| int ret; | ||
|
|
||
| if (config->hot_reload_watchdog_timeout_seconds <= 0) { | ||
| flb_debug("[reload] Hot reload watchdog disabled"); | ||
| return NULL; | ||
| } | ||
|
|
||
| watchdog_ctx = flb_malloc(sizeof(struct flb_reload_watchdog_ctx)); | ||
| if (!watchdog_ctx) { | ||
| flb_errno(); | ||
| return NULL; | ||
| } | ||
| watchdog_ctx->timeout_seconds = config->hot_reload_watchdog_timeout_seconds; | ||
|
|
||
| ret = pthread_create(&watchdog_ctx->tid, NULL, hot_reload_watchdog_thread, watchdog_ctx); | ||
| if (ret != 0) { | ||
| flb_error("[reload] Failed to create hot reload watchdog thread: %d", ret); | ||
| flb_free(watchdog_ctx); | ||
| return NULL; | ||
| } | ||
|
|
||
| flb_debug("[reload] Hot reload watchdog thread started"); | ||
| return watchdog_ctx; | ||
| } | ||
|
|
||
| static void flb_reload_watchdog_cleanup(struct flb_reload_watchdog_ctx *watchdog_ctx) | ||
| { | ||
| if (!watchdog_ctx) { | ||
| return; | ||
| } | ||
|
|
||
| pthread_cancel(watchdog_ctx->tid); | ||
| pthread_join(watchdog_ctx->tid, NULL); | ||
|
Comment on lines
+436
to
+437
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i didn't think any more sophisticated synchronization (around the abort call) was worth the complexity cost but open to opinions (or knowledge i dont have, not great at c) |
||
| flb_debug("[reload] Hot reload watchdog thread cancelled"); | ||
|
|
||
| flb_free(watchdog_ctx); | ||
| } | ||
|
|
||
| int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | ||
| { | ||
| int ret; | ||
|
|
@@ -387,6 +451,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| struct flb_cf *original_cf; | ||
| int verbose; | ||
| int reloaded_count = 0; | ||
| struct flb_reload_watchdog_ctx *watchdog_ctx = NULL; | ||
|
|
||
| if (ctx == NULL) { | ||
| flb_error("[reload] given flb context is NULL"); | ||
|
|
@@ -417,6 +482,9 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| (long unsigned int) getpid(), | ||
| (void *) pthread_self()); | ||
|
|
||
| /* Start the watchdog thread */ | ||
| watchdog_ctx = flb_reload_watchdog_start(old_config); | ||
|
|
||
| if (old_config->conf_path_file) { | ||
| file = flb_sds_create(old_config->conf_path_file); | ||
| } | ||
|
|
@@ -427,6 +495,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| } | ||
| flb_cf_destroy(new_cf); | ||
| flb_error("[reload] reconstruct cf failed"); | ||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return FLB_RELOAD_HALTED; | ||
| } | ||
| } | ||
|
|
@@ -439,7 +508,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| } | ||
| flb_cf_destroy(new_cf); | ||
| flb_error("[reload] creating flb context is failed. Reloading is halted"); | ||
|
|
||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
| return FLB_RELOAD_HALTED; | ||
| } | ||
|
|
||
|
|
@@ -469,7 +538,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| if (!new_cf) { | ||
| flb_sds_destroy(file); | ||
| old_config->hot_reloading = FLB_FALSE; | ||
|
|
||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
| return FLB_RELOAD_HALTED; | ||
| } | ||
| } | ||
|
|
@@ -485,7 +554,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| flb_destroy(new_ctx); | ||
| old_config->hot_reloading = FLB_FALSE; | ||
| flb_error("[reload] reloaded config is invalid. Reloading is halted"); | ||
|
|
||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
| return FLB_RELOAD_HALTED; | ||
| } | ||
| } | ||
|
|
@@ -499,7 +568,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| old_config->hot_reloading = FLB_FALSE; | ||
|
|
||
| flb_error("[reload] reloaded config format is invalid. Reloading is halted"); | ||
|
|
||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
| return FLB_RELOAD_HALTED; | ||
| } | ||
|
|
||
|
|
@@ -512,7 +581,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| old_config->hot_reloading = FLB_FALSE; | ||
|
|
||
| flb_error("[reload] reloaded config is invalid. Reloading is halted"); | ||
|
|
||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
| return FLB_RELOAD_HALTED; | ||
| } | ||
|
|
||
|
|
@@ -541,7 +610,7 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| old_config->hot_reloading = FLB_FALSE; | ||
|
|
||
| flb_error("[reload] loaded configuration contains error(s). Reloading is aborted"); | ||
|
|
||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
| return FLB_RELOAD_ABORTED; | ||
| } | ||
|
|
||
|
|
@@ -550,6 +619,9 @@ int flb_reload(flb_ctx_t *ctx, struct flb_cf *cf_opts) | |
| flb_debug("[reload] hot reloaded %d time(s)", reloaded_count); | ||
| new_config->hot_reloading = FLB_FALSE; | ||
| new_config->hot_reload_succeeded = FLB_TRUE; | ||
|
|
||
| /* Cancel the watchdog thread since reload completed successfully */ | ||
| flb_reload_watchdog_cleanup(watchdog_ctx); | ||
|
|
||
| return 0; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
happy to wrap all this in #ifdef FLB_TESTS_INTERNAL, just want to make sure that's what we want first