Skip to content

Commit 9fcb31f

Browse files
committed
Workaround "global" hang
In rabbit_node_monitor:disconnect/1, we change the "dist_auto_connect" kernel parameter to force a disconnection and give some time to all components to handle the subsequent "nodedown" event. "global" doesn't handle this situation very well. With an unfortunate sequence of messages and bad timings, this can trigger an inconsistency in its internal state. When this happens, global:sync() never returns. See bug 26556, comment #5 for a detailed description. The workaround consists of a process that parses the "global" internal state if global:sync/0 didn't return in 15 seconds. If the state contains an in-progress synchronisation older than 10 seconds, the spawned process sends fake nodedown/nodeup events to "global" on both inconsistent nodes so they restart their synchronisation. This workaround will be removed once the real bugs are fixed and "dist_auto_connect" is left untouched.
1 parent 945b04f commit 9fcb31f

File tree

2 files changed

+67
-1
lines changed

2 files changed

+67
-1
lines changed

src/rabbit_mnesia.erl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ init() ->
109109
%% We intuitively expect the global name server to be synced when
110110
%% Mnesia is up. In fact that's not guaranteed to be the case -
111111
%% let's make it so.
112-
ok = global:sync(),
112+
ok = rabbit_node_monitor:global_sync(),
113113
ok.
114114

115115
init_from_config() ->

src/rabbit_node_monitor.erl

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
-export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]).
2727
-export([partitions/0, partitions/1, status/1, subscribe/1]).
2828
-export([pause_partition_guard/0]).
29+
-export([global_sync/0]).
2930

3031
%% gen_server callbacks
3132
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
@@ -258,6 +259,71 @@ pause_if_all_down_guard(PreferredNodes, LastNodes, LastState) ->
258259
NewState
259260
end.
260261

262+
%%----------------------------------------------------------------------------
263+
%% "global" hang workaround.
264+
%%----------------------------------------------------------------------------
265+
266+
%% This code works around a possible inconsistency in the "global"
267+
%% state, causing global:sync/0 to never return.
268+
%%
269+
%% 1. A process is spawned.
270+
%% 2. If after 15", global:sync() didn't return, the "global"
271+
%% state is parsed.
272+
%% 3. If it detects that a sync is blocked for more than 10",
273+
%% the process sends fake nodedown/nodeup events to the two
274+
%% nodes involved (one local, one remote).
275+
%% 4. Both "global" instances restart their synchronisation.
276+
%% 5. global:sync() finally returns.
277+
%%
278+
%% FIXME: Remove this workaround, once we got rid of the change to
279+
%% "dist_auto_connect" and fixed the bugs uncovered.
280+
281+
%% @doc Runs global:sync/0 under the supervision of a watchdog process
%% which kicks in if the sync does not complete in time (see
%% workaround_global_hang/0). Returns `ok' once the global name server
%% reports it is synced.
global_sync() ->
    Watchdog = spawn(fun workaround_global_hang/0),
    ok = global:sync(),
    Watchdog ! global_sync_done,
    ok.
286+
287+
%% Watchdog body: waits for global_sync/0 to report completion. If no
%% notification arrives within 15 seconds, "global" is assumed to be
%% stuck and its internal state is inspected for blocked peers.
workaround_global_hang() ->
    SyncTimeout = 15000,
    receive
        global_sync_done ->
            ok
    after SyncTimeout ->
            find_blocked_global_peers()
    end.
294+
295+
%% Extracts the process dictionary of the "global" name server from its
%% sys status (the first item of the status list) and scans it for
%% stalled synchronisations.
find_blocked_global_peers() ->
    {status, _Pid, _Mod, [PDict | _Rest]} = sys:get_status(global_name_server),
    find_blocked_global_peers1(PDict).
298+
299+
%% Walks the "global" name server's process dictionary entries. An entry
%% of the form {{sync_tag_his, Node}, Timestamp} records an in-progress
%% synchronisation with Node; if it started more than 10 seconds ago we
%% consider it blocked and try to unblock that peer. All other entries
%% are ignored.
%%
%% Fix: timer:now_diff/2 returns MICROseconds, so the documented
%% 10-second threshold is 10000000; the previous comparison against
%% 10000 amounted to 10 milliseconds.
find_blocked_global_peers1([{{sync_tag_his, Peer}, Timestamp} | Rest]) ->
    Diff = timer:now_diff(erlang:now(), Timestamp),
    if
        %% 10 seconds, expressed in microseconds.
        Diff >= 10000000 -> unblock_global_peer(Peer);
        true             -> ok
    end,
    find_blocked_global_peers1(Rest);
find_blocked_global_peers1([_Other | Rest]) ->
    find_blocked_global_peers1(Rest);
find_blocked_global_peers1([]) ->
    ok.
310+
311+
%% Fakes a nodedown/nodeup cycle between this node and PeerNode on both
%% "global" instances so that they restart their (stuck)
%% synchronisation. Both sides' "global" sys status is logged first to
%% help post-mortem analysis.
unblock_global_peer(PeerNode) ->
    LocalNode = node(),
    RemoteState = rpc:call(PeerNode, sys, get_status, [global_name_server]),
    LocalState = sys:get_status(global_name_server),
    error_logger:info_msg(
      "Global hang workaround: global state on ~s seems broken~n"
      " * Peer global state: ~p~n"
      " * Local global state: ~p~n"
      "Faking nodedown/nodeup between ~s and ~s~n",
      [PeerNode, RemoteState, LocalState, PeerNode, LocalNode]),
    %% Order matters: both nodedown events are delivered before the
    %% nodeup events, mirroring a real disconnect/reconnect.
    Events = [{LocalNode, {nodedown, PeerNode}},
              {PeerNode,  {nodedown, LocalNode}},
              {LocalNode, {nodeup,   PeerNode}},
              {PeerNode,  {nodeup,   LocalNode}}],
    [{global_name_server, Node} ! Event || {Node, Event} <- Events],
    ok.
326+
261327
%%----------------------------------------------------------------------------
262328
%% gen_server callbacks
263329
%%----------------------------------------------------------------------------

0 commit comments

Comments
 (0)