37
37
hash (r:: RRID , h:: UInt ) = hash (r. whence, hash (r. id, h))
38
38
== (r:: RRID , s:: RRID ) = (r. whence== s. whence && r. id== s. id)
39
39
40
+ # # Wire format description
41
+ #
42
+ # Each message has three parts, which are written in order to the worker's stream.
43
+ # 1) A header of type MsgHeader is serialized to the stream (via `serialize`).
44
+ # 2) A message of type AbstractMsg is then serialized.
45
+ # 3) Finally, a fixed bounday of 10 bytes is written.
46
+
47
+ # Message header stored separately from body to be able to send back errors if
48
+ # a deserialization error occurs when reading the message body.
49
+ type MsgHeader
50
+ response_oid:: RRID
51
+ notify_oid:: RRID
52
+ end
53
+
54
+ # Special oid (0,0) uses to indicate a null ID.
55
+ # Used instead of Nullable to decrease wire size of header.
56
+ null_id (id) = id == RRID (0 , 0 )
57
+
58
+ MsgHeader (;response_oid:: RRID = RRID (0 ,0 ), notify_oid:: RRID = RRID (0 ,0 )) =
59
+ MsgHeader (response_oid, notify_oid)
40
60
41
61
type CallMsg{Mode} <: AbstractMsg
42
62
f:: Function
43
63
args:: Tuple
44
64
kwargs:: Array
45
- response_oid:: RRID
46
65
end
47
66
type CallWaitMsg <: AbstractMsg
48
67
f:: Function
49
68
args:: Tuple
50
69
kwargs:: Array
51
- response_oid:: RRID
52
- notify_oid:: RRID
53
70
end
54
71
type RemoteDoMsg <: AbstractMsg
55
72
f:: Function
56
73
args:: Tuple
57
74
kwargs:: Array
58
75
end
59
76
type ResultMsg <: AbstractMsg
60
- response_oid:: RRID
61
77
value:: Any
62
78
end
63
79
80
+
64
81
# Worker initialization messages
65
82
type IdentifySocketMsg <: AbstractMsg
66
83
from_pid:: Int
70
87
type JoinPGRPMsg <: AbstractMsg
71
88
self_pid:: Int
72
89
other_workers:: Array
73
- notify_oid:: RRID
74
90
topology:: Symbol
75
91
worker_pool
76
92
end
77
93
type JoinCompleteMsg <: AbstractMsg
78
- notify_oid:: RRID
79
94
cpu_cores:: Int
80
95
ospid:: Int
81
96
end
82
97
83
- function send_msg_unknown (s:: IO , msg)
98
+ function send_msg_unknown (s:: IO , header, msg)
84
99
error (" attempt to send to unknown socket" )
85
100
end
86
101
87
- function send_msg (s:: IO , msg)
102
+ function send_msg (s:: IO , header, msg)
88
103
id = worker_id_from_socket (s)
89
104
if id > - 1
90
- return send_msg (worker_from_id (id), msg)
105
+ return send_msg (worker_from_id (id), header, msg)
91
106
end
92
- send_msg_unknown (s, msg)
107
+ send_msg_unknown (s, header, msg)
93
108
end
94
109
95
- function send_msg_now (s:: IO , msg:: AbstractMsg )
110
+ function send_msg_now (s:: IO , msghdr, msg:: AbstractMsg )
96
111
id = worker_id_from_socket (s)
97
112
if id > - 1
98
- return send_msg_now (worker_from_id (id), msg)
113
+ return send_msg_now (worker_from_id (id), msghdr, msg)
99
114
end
100
- send_msg_unknown (s, msg)
115
+ send_msg_unknown (s, msghdr, msg)
101
116
end
102
117
103
118
abstract ClusterManager
@@ -197,12 +212,12 @@ function set_worker_state(w, state)
197
212
notify (w. c_state; all= true )
198
213
end
199
214
200
- function send_msg_now (w:: Worker , msg)
201
- send_msg_ (w, msg, true )
215
+ function send_msg_now (w:: Worker , msghdr, msg)
216
+ send_msg_ (w, msghdr, msg, true )
202
217
end
203
218
204
- function send_msg (w:: Worker , msg)
205
- send_msg_ (w, msg, false )
219
+ function send_msg (w:: Worker , msghdr, msg)
220
+ send_msg_ (w, msghdr, msg, false )
206
221
end
207
222
208
223
function flush_gc_msgs (w:: Worker )
@@ -241,14 +256,20 @@ function check_worker_state(w::Worker)
241
256
end
242
257
end
243
258
259
+ # Boundary inserted between messages on the wire, used for recovering
260
+ # from deserialization errors. Picked arbitrarily.
261
+ # A size of 10 bytes indicates ~ ~1e24 possible boundaries, so chance of collision with message contents is trivial.
262
+ const MSG_BOUNDARY = UInt8[0x79 , 0x8e , 0x8e , 0xf5 , 0x6e , 0x9b , 0x2e , 0x97 , 0xd5 , 0x7d ]
244
263
245
- function send_msg_ (w:: Worker , msg, now:: Bool )
264
+ function send_msg_ (w:: Worker , header, msg, now:: Bool )
246
265
check_worker_state (w)
247
266
io = w. w_stream
248
267
lock (io. lock)
249
268
try
250
269
reset_state (w. w_serializer)
270
+ serialize (w. w_serializer, header)
251
271
serialize (w. w_serializer, msg) # io is wrapped in w_serializer
272
+ write (io, MSG_BOUNDARY)
252
273
253
274
if ! now && w. gcflag
254
275
flush_gc_msgs (w)
@@ -768,7 +789,6 @@ function showerror(io::IO, re::RemoteException)
768
789
showerror (io, re. captured)
769
790
end
770
791
771
-
772
792
function run_work_thunk (thunk, print_error)
773
793
local result
774
794
try
811
831
function remotecall (f, w:: Worker , args... ; kwargs... )
812
832
rr = Future (w)
813
833
# println("$(myid()) asking for $rr")
814
- send_msg (w, CallMsg {:call} (f, args, kwargs, remoteref_id (rr) ))
834
+ send_msg (w, MsgHeader (response_oid = remoteref_id (rr)), CallMsg {:call} (f, args, kwargs))
815
835
rr
816
836
end
817
837
@@ -829,7 +849,7 @@ function remotecall_fetch(f, w::Worker, args...; kwargs...)
829
849
oid = RRID ()
830
850
rv = lookup_ref (oid)
831
851
rv. waitingfor = w. id
832
- send_msg (w, CallMsg {:call_fetch} (f, args, kwargs, oid ))
852
+ send_msg (w, MsgHeader (response_oid = oid), CallMsg {:call_fetch} (f, args, kwargs))
833
853
v = take! (rv)
834
854
delete! (PGRP. refs, oid)
835
855
isa (v, RemoteException) ? throw (v) : v
@@ -846,7 +866,7 @@ function remotecall_wait(f, w::Worker, args...; kwargs...)
846
866
rv = lookup_ref (prid)
847
867
rv. waitingfor = w. id
848
868
rr = Future (w)
849
- send_msg (w, CallWaitMsg (f, args, kwargs, remoteref_id (rr), prid ))
869
+ send_msg (w, MsgHeader (response_oid = remoteref_id (rr), notify_oid = prid), CallWaitMsg (f, args, kwargs ))
850
870
v = fetch (rv. c)
851
871
delete! (PGRP. refs, prid)
852
872
isa (v, RemoteException) && throw (v)
@@ -866,7 +886,7 @@ function remote_do(f, w::LocalProcess, args...; kwargs...)
866
886
end
867
887
868
888
function remote_do (f, w:: Worker , args... ; kwargs... )
869
- send_msg (w, RemoteDoMsg (f, args, kwargs))
889
+ send_msg (w, MsgHeader (), RemoteDoMsg (f, args, kwargs))
870
890
nothing
871
891
end
872
892
@@ -952,13 +972,13 @@ close(rr::RemoteChannel) = call_on_owner(close_ref, rr)
952
972
953
973
function deliver_result (sock:: IO , msg, oid, value)
954
974
# print("$(myid()) sending result $oid\n")
955
- if is (msg,:call_fetch ) || isa (value, RemoteException)
975
+ if is (msg, :call_fetch ) || isa (value, RemoteException)
956
976
val = value
957
977
else
958
978
val = :OK
959
979
end
960
980
try
961
- send_msg_now (sock, ResultMsg ( oid, val))
981
+ send_msg_now (sock, MsgHeader (response_oid = oid), ResultMsg ( val))
962
982
catch e
963
983
# terminate connection in case of serialization error
964
984
# otherwise the reading end would hang
@@ -996,28 +1016,73 @@ function process_messages(r_stream::IO, w_stream::IO, incoming=true)
996
1016
end
997
1017
998
1018
function message_handler_loop (r_stream:: IO , w_stream:: IO , incoming:: Bool )
1019
+ wpid= 0 # the worker r_stream is connected to.
1020
+ boundary = similar (MSG_BOUNDARY)
999
1021
try
1000
1022
version = process_hdr (r_stream, incoming)
1001
1023
serializer = ClusterSerializer (r_stream)
1024
+
1025
+ # The first message will associate wpid with r_stream
1026
+ msghdr = deserialize (serializer)
1027
+ msg = deserialize (serializer)
1028
+ readbytes! (r_stream, boundary, length (MSG_BOUNDARY))
1029
+
1030
+ handle_msg (msg, msghdr, r_stream, w_stream, version)
1031
+ wpid = worker_id_from_socket (r_stream)
1032
+
1033
+ @assert wpid > 0
1034
+
1002
1035
while true
1003
1036
reset_state (serializer)
1004
- msg = deserialize (serializer)
1005
- # println("got msg: ", msg)
1006
- handle_msg (msg, r_stream, w_stream, version)
1037
+ msghdr = deserialize (serializer)
1038
+ # println("msghdr: ", msghdr)
1039
+
1040
+ try
1041
+ msg = deserialize (serializer)
1042
+ catch e
1043
+ # Deserialization error; discard bytes in stream until boundary found
1044
+ boundary_idx = 1
1045
+ while true
1046
+ # This may throw an EOF error if the terminal boundary was not written
1047
+ # correctly, triggering the higher-scoped catch block below
1048
+ byte = read (r_stream, UInt8)
1049
+ if byte == MSG_BOUNDARY[boundary_idx]
1050
+ boundary_idx += 1
1051
+ if boundary_idx > length (MSG_BOUNDARY)
1052
+ break
1053
+ end
1054
+ else
1055
+ boundary_idx = 1
1056
+ end
1057
+ end
1058
+ # println("Deserialization error.")
1059
+ remote_err = RemoteException (myid (), CapturedException (e, catch_backtrace ()))
1060
+ if ! null_id (msghdr. response_oid)
1061
+ ref = lookup_ref (msghdr. response_oid)
1062
+ put! (ref, remote_err)
1063
+ end
1064
+ if ! null_id (msghdr. notify_oid)
1065
+ deliver_result (w_stream, :call_fetch , msghdr. notify_oid, remote_err)
1066
+ end
1067
+ continue
1068
+ end
1069
+ readbytes! (r_stream, boundary, length (MSG_BOUNDARY))
1070
+
1071
+ # println("got msg: ", typeof(msg))
1072
+ handle_msg (msg, msghdr, r_stream, w_stream, version)
1007
1073
end
1008
1074
catch e
1009
1075
# println(STDERR, "Process($(myid())) - Exception ", e)
1010
- iderr = worker_id_from_socket (r_stream)
1011
- if (iderr < 1 )
1076
+ if (wpid < 1 )
1012
1077
println (STDERR, e)
1013
1078
println (STDERR, " Process($(myid ()) ) - Unknown remote, closing connection." )
1014
1079
else
1015
- werr = worker_from_id (iderr )
1080
+ werr = worker_from_id (wpid )
1016
1081
oldstate = werr. state
1017
1082
set_worker_state (werr, W_TERMINATED)
1018
1083
1019
- # If error occured talking to pid 1, commit harakiri
1020
- if iderr == 1
1084
+ # If unhandleable error occured talking to pid 1, exit
1085
+ if wpid == 1
1021
1086
if isopen (w_stream)
1022
1087
print (STDERR, " fatal error on " , myid (), " : " )
1023
1088
display_error (e, catch_backtrace ())
@@ -1028,15 +1093,15 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool)
1028
1093
# Will treat any exception as death of node and cleanup
1029
1094
# since currently we do not have a mechanism for workers to reconnect
1030
1095
# to each other on unhandled errors
1031
- deregister_worker (iderr )
1096
+ deregister_worker (wpid )
1032
1097
end
1033
1098
1034
1099
isopen (r_stream) && close (r_stream)
1035
1100
isopen (w_stream) && close (w_stream)
1036
1101
1037
- if (myid () == 1 ) && (iderr > 1 )
1102
+ if (myid () == 1 ) && (wpid > 1 )
1038
1103
if oldstate != W_TERMINATING
1039
- println (STDERR, " Worker $iderr terminated." )
1104
+ println (STDERR, " Worker $wpid terminated." )
1040
1105
rethrow (e)
1041
1106
end
1042
1107
end
@@ -1071,44 +1136,44 @@ function process_hdr(s, validate_cookie)
1071
1136
return VersionNumber (strip (String (version)))
1072
1137
end
1073
1138
1074
- function handle_msg (msg:: CallMsg{:call} , r_stream, w_stream, version)
1075
- schedule_call (msg . response_oid, ()-> msg. f (msg. args... ; msg. kwargs... ))
1139
+ function handle_msg (msg:: CallMsg{:call} , msghdr, r_stream, w_stream, version)
1140
+ schedule_call (msghdr . response_oid, ()-> msg. f (msg. args... ; msg. kwargs... ))
1076
1141
end
1077
- function handle_msg (msg:: CallMsg{:call_fetch} , r_stream, w_stream, version)
1142
+ function handle_msg (msg:: CallMsg{:call_fetch} , msghdr, r_stream, w_stream, version)
1078
1143
@schedule begin
1079
1144
v = run_work_thunk (()-> msg. f (msg. args... ; msg. kwargs... ), false )
1080
- deliver_result (w_stream, :call_fetch , msg . response_oid, v)
1145
+ deliver_result (w_stream, :call_fetch , msghdr . response_oid, v)
1081
1146
end
1082
1147
end
1083
1148
1084
- function handle_msg (msg:: CallWaitMsg , r_stream, w_stream, version)
1149
+ function handle_msg (msg:: CallWaitMsg , msghdr, r_stream, w_stream, version)
1085
1150
@schedule begin
1086
- rv = schedule_call (msg . response_oid, ()-> msg. f (msg. args... ; msg. kwargs... ))
1087
- deliver_result (w_stream, :call_wait , msg . notify_oid, fetch (rv. c))
1151
+ rv = schedule_call (msghdr . response_oid, ()-> msg. f (msg. args... ; msg. kwargs... ))
1152
+ deliver_result (w_stream, :call_wait , msghdr . notify_oid, fetch (rv. c))
1088
1153
end
1089
1154
end
1090
1155
1091
- function handle_msg (msg:: RemoteDoMsg , r_stream, w_stream, version)
1156
+ function handle_msg (msg:: RemoteDoMsg , msghdr, r_stream, w_stream, version)
1092
1157
@schedule run_work_thunk (()-> msg. f (msg. args... ; msg. kwargs... ), true )
1093
1158
end
1094
1159
1095
- function handle_msg (msg:: ResultMsg , r_stream, w_stream, version)
1096
- put! (lookup_ref (msg . response_oid), msg. value)
1160
+ function handle_msg (msg:: ResultMsg , msghdr, r_stream, w_stream, version)
1161
+ put! (lookup_ref (msghdr . response_oid), msg. value)
1097
1162
end
1098
1163
1099
- function handle_msg (msg:: IdentifySocketMsg , r_stream, w_stream, version)
1164
+ function handle_msg (msg:: IdentifySocketMsg , msghdr, r_stream, w_stream, version)
1100
1165
# register a new peer worker connection
1101
1166
w= Worker (msg. from_pid, r_stream, w_stream, cluster_manager; version= version)
1102
1167
send_connection_hdr (w, false )
1103
- send_msg_now (w, IdentifySocketAckMsg ())
1168
+ send_msg_now (w, MsgHeader (), IdentifySocketAckMsg ())
1104
1169
end
1105
1170
1106
- function handle_msg (msg:: IdentifySocketAckMsg , r_stream, w_stream, version)
1171
+ function handle_msg (msg:: IdentifySocketAckMsg , msghdr, r_stream, w_stream, version)
1107
1172
w = map_sock_wrkr[r_stream]
1108
1173
w. version = version
1109
1174
end
1110
1175
1111
- function handle_msg (msg:: JoinPGRPMsg , r_stream, w_stream, version)
1176
+ function handle_msg (msg:: JoinPGRPMsg , msghdr, r_stream, w_stream, version)
1112
1177
LPROC. id = msg. self_pid
1113
1178
controller = Worker (1 , r_stream, w_stream, cluster_manager; version= version)
1114
1179
register_worker (LPROC)
@@ -1129,7 +1194,7 @@ function handle_msg(msg::JoinPGRPMsg, r_stream, w_stream, version)
1129
1194
1130
1195
set_default_worker_pool (msg. worker_pool)
1131
1196
send_connection_hdr (controller, false )
1132
- send_msg_now (controller, JoinCompleteMsg (msg . notify_oid, Sys. CPU_CORES, getpid ()))
1197
+ send_msg_now (controller, MsgHeader (notify_oid = msghdr . notify_oid), JoinCompleteMsg ( Sys. CPU_CORES, getpid ()))
1133
1198
end
1134
1199
1135
1200
function connect_to_peer (manager:: ClusterManager , rpid:: Int , wconfig:: WorkerConfig )
@@ -1138,23 +1203,23 @@ function connect_to_peer(manager::ClusterManager, rpid::Int, wconfig::WorkerConf
1138
1203
w = Worker (rpid, r_s, w_s, manager; config= wconfig)
1139
1204
process_messages (w. r_stream, w. w_stream, false )
1140
1205
send_connection_hdr (w, true )
1141
- send_msg_now (w, IdentifySocketMsg (myid ()))
1206
+ send_msg_now (w, MsgHeader (), IdentifySocketMsg (myid ()))
1142
1207
catch e
1143
1208
display_error (e, catch_backtrace ())
1144
1209
println (STDERR, " Error [$e ] on $(myid ()) while connecting to peer $rpid . Exiting." )
1145
1210
exit (1 )
1146
1211
end
1147
1212
end
1148
1213
1149
- function handle_msg (msg:: JoinCompleteMsg , r_stream, w_stream, version)
1214
+ function handle_msg (msg:: JoinCompleteMsg , msghdr, r_stream, w_stream, version)
1150
1215
w = map_sock_wrkr[r_stream]
1151
1216
environ = get (w. config. environ, Dict ())
1152
1217
environ[:cpu_cores ] = msg. cpu_cores
1153
1218
w. config. environ = environ
1154
1219
w. config. ospid = msg. ospid
1155
1220
w. version = version
1156
1221
1157
- ntfy_channel = lookup_ref (msg . notify_oid)
1222
+ ntfy_channel = lookup_ref (msghdr . notify_oid)
1158
1223
put! (ntfy_channel, w. id)
1159
1224
1160
1225
push! (default_worker_pool (), w)
@@ -1478,7 +1543,7 @@ function create_worker(manager, wconfig)
1478
1543
1479
1544
all_locs = map (x -> isa (x, Worker) ? (get (x. config. connect_at, ()), x. id) : ((), x. id, true ), join_list)
1480
1545
send_connection_hdr (w, true )
1481
- send_msg_now (w, JoinPGRPMsg (w. id, all_locs, ntfy_oid , PGRP. topology, default_worker_pool ()))
1546
+ send_msg_now (w, MsgHeader (notify_oid = ntfy_oid), JoinPGRPMsg (w. id, all_locs, PGRP. topology, default_worker_pool ()))
1482
1547
1483
1548
@schedule manage (w. manager, w. id, w. config, :register )
1484
1549
wait (rr_ntfy_join)
0 commit comments