61
61
# Worker initialization messages
62
62
type IdentifySocketMsg <: AbstractMsg
63
63
from_pid:: Int
64
+ cookie:: AbstractString
65
+ end
66
+ type IdentifySocketAckMsg <: AbstractMsg
67
+ cookie:: AbstractString
64
68
end
65
69
type JoinPGRPMsg <: AbstractMsg
66
70
self_pid:: Int
67
71
other_workers:: Array
68
- self_is_local:: Bool
69
72
notify_oid:: RRID
70
73
topology:: Symbol
71
74
worker_pool
75
+ cookie:: AbstractString
72
76
end
73
77
type JoinCompleteMsg <: AbstractMsg
74
78
notify_oid:: RRID
75
79
cpu_cores:: Int
76
80
ospid:: Int
81
+ cookie:: AbstractString
77
82
end
78
83
79
84
@@ -270,11 +275,15 @@ type LocalProcess
270
275
id:: Int
271
276
bind_addr:: AbstractString
272
277
bind_port:: UInt16
278
+ cookie:: AbstractString
273
279
LocalProcess () = new (1 )
274
280
end
275
281
276
282
const LPROC = LocalProcess ()
277
283
284
+ cluster_cookie () = LPROC. cookie
285
+ cluster_cookie (cookie) = (LPROC. cookie = cookie; cookie)
286
+
278
287
const map_pid_wrkr = Dict {Int, Union{Worker, LocalProcess}} ()
279
288
const map_sock_wrkr = ObjectIdDict ()
280
289
const map_del_wrkr = Set {Int} ()
@@ -962,19 +971,25 @@ end
962
971
process_messages (r_stream:: IO , w_stream:: IO ) = @schedule message_handler_loop (r_stream, w_stream)
963
972
964
973
function message_handler_loop (r_stream:: IO , w_stream:: IO )
965
- global PGRP
966
- global cluster_manager
967
-
968
974
try
975
+ # Check for a valid first message with a cookie.
976
+ msg = deserialize (r_stream)
977
+ if ! any (x-> isa (msg, x), [JoinPGRPMsg, JoinCompleteMsg, IdentifySocketMsg, IdentifySocketAckMsg]) ||
978
+ (msg. cookie != cluster_cookie ())
979
+
980
+ println (STDERR, " Unknown first message $(typeof (msg)) or cookie mismatch." )
981
+ error (" Invalid connection credentials." )
982
+ end
983
+
969
984
while true
985
+ handle_msg (msg, r_stream, w_stream)
970
986
msg = deserialize (r_stream)
971
987
# println("got msg: ", msg)
972
- handle_msg (msg, r_stream, w_stream)
973
988
end
974
989
catch e
975
990
iderr = worker_id_from_socket (r_stream)
976
991
if (iderr < 1 )
977
- print (STDERR, " Socket from unknown remote worker in worker " , myid ())
992
+ println (STDERR, " Socket from unknown remote worker in worker $( myid ()) " )
978
993
else
979
994
werr = worker_from_id (iderr)
980
995
oldstate = werr. state
@@ -995,8 +1010,8 @@ function message_handler_loop(r_stream::IO, w_stream::IO)
995
1010
deregister_worker (iderr)
996
1011
end
997
1012
998
- if isopen (r_stream) close (r_stream) end
999
- if isopen (w_stream) close (w_stream) end
1013
+ isopen (r_stream) && close (r_stream)
1014
+ isopen (w_stream) && close (w_stream)
1000
1015
1001
1016
if (myid () == 1 ) && (iderr > 1 )
1002
1017
if oldstate != W_TERMINATING
@@ -1028,7 +1043,12 @@ handle_msg(msg::RemoteDoMsg, r_stream, w_stream) = @schedule run_work_thunk(()->
1028
1043
1029
1044
handle_msg (msg:: ResultMsg , r_stream, w_stream) = put! (lookup_ref (msg. response_oid), msg. value)
1030
1045
1031
- handle_msg (msg:: IdentifySocketMsg , r_stream, w_stream) = Worker (msg. from_pid, r_stream, w_stream, cluster_manager)
1046
+ function handle_msg (msg:: IdentifySocketMsg , r_stream, w_stream)
1047
+ # register a new peer worker connection
1048
+ w= Worker (msg. from_pid, r_stream, w_stream, cluster_manager)
1049
+ send_msg_now (w, IdentifySocketAckMsg (cluster_cookie ()))
1050
+ end
1051
+ handle_msg (msg:: IdentifySocketAckMsg , r_stream, w_stream) = nothing
1032
1052
1033
1053
function handle_msg (msg:: JoinPGRPMsg , r_stream, w_stream)
1034
1054
LPROC. id = msg. self_pid
@@ -1037,10 +1057,9 @@ function handle_msg(msg::JoinPGRPMsg, r_stream, w_stream)
1037
1057
topology (msg. topology)
1038
1058
1039
1059
wait_tasks = Task[]
1040
- for (connect_at, rpid, r_is_local ) in msg. other_workers
1060
+ for (connect_at, rpid) in msg. other_workers
1041
1061
wconfig = WorkerConfig ()
1042
1062
wconfig. connect_at = connect_at
1043
- wconfig. environ = AnyDict (:self_is_local => msg. self_is_local, :r_is_local => r_is_local)
1044
1063
1045
1064
let rpid= rpid, wconfig= wconfig
1046
1065
t = @async connect_to_peer (cluster_manager, rpid, wconfig)
@@ -1052,24 +1071,24 @@ function handle_msg(msg::JoinPGRPMsg, r_stream, w_stream)
1052
1071
1053
1072
set_default_worker_pool (msg. worker_pool)
1054
1073
1055
- send_msg_now (controller, JoinCompleteMsg (msg. notify_oid, Sys. CPU_CORES, getpid ()))
1074
+ send_msg_now (controller, JoinCompleteMsg (msg. notify_oid, Sys. CPU_CORES, getpid (), cluster_cookie () ))
1056
1075
end
1057
1076
1058
1077
function connect_to_peer (manager:: ClusterManager , rpid:: Int , wconfig:: WorkerConfig )
1059
1078
try
1060
1079
(r_s, w_s) = connect (manager, rpid, wconfig)
1061
1080
w = Worker (rpid, r_s, w_s, manager, wconfig)
1062
1081
process_messages (w. r_stream, w. w_stream)
1063
- send_msg_now (w, IdentifySocketMsg (myid ()))
1082
+ send_msg_now (w, IdentifySocketMsg (myid (), cluster_cookie () ))
1064
1083
catch e
1084
+ display_error (e, catch_backtrace ())
1065
1085
println (STDERR, " Error [$e ] on $(myid ()) while connecting to peer $rpid . Exiting." )
1066
1086
exit (1 )
1067
1087
end
1068
1088
end
1069
1089
1070
1090
function handle_msg (msg:: JoinCompleteMsg , r_stream, w_stream)
1071
1091
w = map_sock_wrkr[r_stream]
1072
-
1073
1092
environ = get (w. config. environ, Dict ())
1074
1093
environ[:cpu_cores ] = msg. cpu_cores
1075
1094
w. config. environ = environ
@@ -1093,8 +1112,8 @@ worker_timeout() = parse(Float64, get(ENV, "JULIA_WORKER_TIMEOUT", "60.0"))
1093
1112
# The entry point for julia worker processes. does not return. Used for TCP transport.
1094
1113
# Cluster managers implementing their own transport will provide their own.
1095
1114
# Argument is descriptor to write listening port # to.
1096
- start_worker () = start_worker (STDOUT)
1097
- function start_worker (out:: IO )
1115
+ start_worker (cookie :: AbstractString ) = start_worker (STDOUT, cookie )
1116
+ function start_worker (out:: IO , cookie :: AbstractString )
1098
1117
# we only explicitly monitor worker STDOUT on the console, so redirect
1099
1118
# stderr to stdout so we can see the output.
1100
1119
# at some point we might want some or all worker output to go to log
@@ -1103,12 +1122,13 @@ function start_worker(out::IO)
1103
1122
# exit when process 1 shut down. Don't yet know why.
1104
1123
# redirect_stderr(STDOUT)
1105
1124
1106
- init_worker ()
1125
+ init_worker (cookie)
1126
+ interface = IPv4 (LPROC. bind_addr)
1107
1127
if LPROC. bind_port == 0
1108
- (actual_port,sock) = listenany (UInt16 (9009 ))
1128
+ (actual_port,sock) = listenany (interface, UInt16 (9009 ))
1109
1129
LPROC. bind_port = actual_port
1110
1130
else
1111
- sock = listen (LPROC. bind_port)
1131
+ sock = listen (interface, LPROC. bind_port)
1112
1132
end
1113
1133
@schedule while isopen (sock)
1114
1134
client = accept (sock)
@@ -1180,7 +1200,7 @@ function parse_connection_info(str)
1180
1200
end
1181
1201
end
1182
1202
1183
- function init_worker (manager:: ClusterManager = DefaultClusterManager ())
1203
+ function init_worker (cookie :: AbstractString , manager:: ClusterManager = DefaultClusterManager ())
1184
1204
# On workers, the default cluster manager connects via TCP sockets. Custom
1185
1205
# transports will need to call this function with their own manager.
1186
1206
global cluster_manager
@@ -1195,6 +1215,9 @@ function init_worker(manager::ClusterManager=DefaultClusterManager())
1195
1215
# System is started in head node mode, cleanup entries related to the same
1196
1216
empty! (PGRP. workers)
1197
1217
empty! (map_pid_wrkr)
1218
+
1219
+ cluster_cookie (cookie)
1220
+ nothing
1198
1221
end
1199
1222
1200
1223
@@ -1391,8 +1414,8 @@ function create_worker(manager, wconfig)
1391
1414
end
1392
1415
end
1393
1416
1394
- all_locs = map (x -> isa (x, Worker) ? (get (x. config. connect_at, ()), x. id, isa (x . manager, LocalManager) ) : ((), x. id, true ), join_list)
1395
- send_msg_now (w, JoinPGRPMsg (w. id, all_locs, isa (w . manager, LocalManager), ntfy_oid, PGRP. topology, default_worker_pool ()))
1417
+ all_locs = map (x -> isa (x, Worker) ? (get (x. config. connect_at, ()), x. id) : ((), x. id, true ), join_list)
1418
+ send_msg_now (w, JoinPGRPMsg (w. id, all_locs, ntfy_oid, PGRP. topology, default_worker_pool (), cluster_cookie ()))
1396
1419
1397
1420
@schedule manage (w. manager, w. id, w. config, :register )
1398
1421
wait (rr_ntfy_join)
0 commit comments