@@ -184,7 +184,7 @@ def get_collection_ids(collections):
     return coll_ids
 
 
-def get_input_output_maps(transform_id, work):
+def get_input_output_maps(transform_id, work, with_deps=True):
     # link collections
     input_collections = work.get_input_collections()
     output_collections = work.get_output_collections()
@@ -202,7 +202,8 @@ def get_input_output_maps(transform_id, work):
                                              input_coll_ids=input_coll_ids,
                                              output_coll_ids=output_coll_ids,
                                              log_coll_ids=log_coll_ids,
-                                             with_sub_map_id=work.with_sub_map_id())
+                                             with_sub_map_id=work.with_sub_map_id(),
+                                             with_deps=with_deps)
 
     # work_name_to_coll_map = core_transforms.get_work_name_to_coll_map(request_id=transform['request_id'])
     # work.set_work_name_to_coll_map(work_name_to_coll_map)
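
The `with_deps` flag added above defaults to `True`, so existing callers of `get_input_output_maps` keep their old behavior; handlers that only need the input/output skeleton can opt out of loading dependency rows. A minimal sketch of the pattern, with illustrative names rather than the real iDDS internals:

```python
# Sketch only: illustrative names, not the iDDS implementation.
def load_maps(transform_id, with_deps=True):
    maps = {'transform_id': transform_id, 'inputs': [], 'outputs': []}
    if with_deps:
        # only pay for the (potentially huge) dependency join when asked
        maps['inputs_dependency'] = []
    return maps

full = load_maps(42)                        # old behavior: dependencies included
skeleton = load_maps(42, with_deps=False)   # cheap variant used by the handlers below
```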
@@ -603,7 +604,7 @@ def handle_new_processing(processing, agent_attributes, func_site_to_cloud=None,
             update_collections.append(u_coll)
 
     if proc.submitted_at:
-        input_output_maps = get_input_output_maps(transform_id, work)
+        input_output_maps = get_input_output_maps(transform_id, work, with_deps=False)
         new_input_output_maps = work.get_new_input_output_maps(input_output_maps)
         request_id = processing['request_id']
         transform_id = processing['transform_id']
@@ -700,7 +701,7 @@ def get_input_output_sub_maps(inputs, outputs, inputs_dependency, logs=[]):
     return input_output_sub_maps
 
 
-def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated=False, max_updates_per_round=2000, logger=None, log_prefix=''):
+def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated=False, max_updates_per_round=2000, with_deps=False, logger=None, log_prefix=''):
     updated_contents, updated_contents_full_input, updated_contents_full_output = [], [], []
     updated_contents_full_input_deps = []
     new_update_contents = []
@@ -762,10 +763,12 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated
         inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency']
 
         input_content_update_status = None
-        if is_all_contents_available(inputs_dependency_sub):
-            input_content_update_status = ContentStatus.Available
-        elif is_all_contents_terminated(inputs_dependency_sub, terminated):
-            input_content_update_status = ContentStatus.Missing
+        if with_deps:
+            # if deps are not loaded, this check must be skipped; otherwise it would release all jobs
+            if is_all_contents_available(inputs_dependency_sub):
+                input_content_update_status = ContentStatus.Available
+            elif is_all_contents_terminated(inputs_dependency_sub, terminated):
+                input_content_update_status = ContentStatus.Missing
         if input_content_update_status:
             for content in inputs_sub:
                 if content['substatus'] != input_content_update_status:
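
The `with_deps` guard above is load-bearing: an `all()`-style availability check is vacuously true on an empty list, so if dependency rows were never loaded (`with_deps=False`), every sub-map would look available and all jobs would be released, exactly what the inline comment warns about. A hedged sketch, assuming `is_all_contents_available` is an `all()`-style check (the real helper lives elsewhere in iDDS):

```python
# Sketch, assuming an all()-style check like the real helper.
def is_all_contents_available(contents):
    return all(c['substatus'] == 'Available' for c in contents)

unloaded_deps = []  # with_deps=False: dependency rows were skipped
print(is_all_contents_available(unloaded_deps))  # True (vacuously) -> would wrongly release jobs
```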
@@ -1187,7 +1190,7 @@ def handle_update_processing(processing, agent_attributes, max_updates_per_round
     work = proc.work
     work.set_agent_attributes(agent_attributes, processing)
 
-    input_output_maps = get_input_output_maps(transform_id, work)
+    input_output_maps = get_input_output_maps(transform_id, work, with_deps=False)
     logger.debug(log_prefix + "get_input_output_maps: len: %s" % len(input_output_maps))
     logger.debug(log_prefix + "get_input_output_maps.keys[:3]: %s" % str(list(input_output_maps.keys())[:3]))
 
@@ -1414,11 +1417,35 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates=
     request_id = processing['request_id']
     transform_id = processing['transform_id']
     workload_id = processing['workload_id']
+    processing_id = processing['processing_id']
 
     proc = processing['processing_metadata']['processing']
     work = proc.work
     work.set_agent_attributes(agent_attributes, processing)
 
+    num_dependencies = None
+    num_inputs = None
+    default_input_dep_page_size = 500
+    min_input_dep_page_size = 100
+    max_dependencies = 5000
+    try:
+        num_inputs = work.num_inputs
+        num_dependencies = work.num_dependencies
+        if num_inputs is not None and num_dependencies is not None and num_dependencies > 0:
+            input_dep_page_size = int(max_dependencies * num_inputs / num_dependencies)
+            if input_dep_page_size < default_input_dep_page_size:
+                log_info = f"input_dep_page_size ({input_dep_page_size}) is smaller than default_input_dep_page_size ({default_input_dep_page_size}), "
+                log_info += "update default_input_dep_page_size from input_dep_page_size"
+                logger.info(log_info)
+                default_input_dep_page_size = input_dep_page_size
+            if default_input_dep_page_size < min_input_dep_page_size:
+                log_info = f"default_input_dep_page_size ({default_input_dep_page_size}) is smaller than min_input_dep_page_size ({min_input_dep_page_size}), "
+                log_info += "update default_input_dep_page_size from min_input_dep_page_size"
+                logger.info(log_info)
+                default_input_dep_page_size = min_input_dep_page_size
+    except Exception as ex:
+        logger.warn(f"request_id ({request_id}) transform_id ({transform_id}) processing_id ({processing_id}) fails to get num_dependencies: {ex}")
+
     if (not work.use_dependency_to_release_jobs()) or workload_id is None:
         return processing['substatus'], [], [], {}, {}, {}, [], [], has_updates
     else:
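
The block above sizes dependency pages so that one page carries roughly `max_dependencies` (5000) dependency rows: with `num_dependencies / num_inputs` dependencies per input on average, about `max_dependencies * num_inputs / num_dependencies` inputs fit in a page, clamped between the 100 minimum and the 500 default. A condensed restatement of that arithmetic (a sketch, not the production code):

```python
# Condensed restatement of the page-size heuristic; sketch, not production code.
MAX_DEPENDENCIES, DEFAULT_PAGE, MIN_PAGE = 5000, 500, 100

def pick_page_size(num_inputs, num_dependencies):
    page_size = DEFAULT_PAGE
    if num_inputs and num_dependencies:
        # inputs per page * (average deps per input) ~= MAX_DEPENDENCIES
        page_size = min(page_size, int(MAX_DEPENDENCIES * num_inputs / num_dependencies))
    return max(page_size, MIN_PAGE)

print(pick_page_size(2000, 100000))  # 50 deps/input -> int(5000*2000/100000) = 100
print(pick_page_size(2000, 4000))    # 2 deps/input -> 2500, capped at the 500 default
```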
@@ -1467,6 +1494,7 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates=
         core_catalog.delete_contents_update(request_id=request_id, transform_id=transform_id, fetch=True)
         logger.debug(log_prefix + "sync contents_update to contents done")
 
+        """
         logger.debug(log_prefix + "update_contents_from_others_by_dep_id")
         # core_catalog.update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id)
         to_triggered_contents = core_catalog.get_update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id)
@@ -1490,19 +1518,38 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates=
             wait_futures_finish(ret_futures, "update_contents_from_others_by_dep_id", logger, log_prefix)
 
         logger.debug(log_prefix + "update_contents_from_others_by_dep_id done")
+        """
 
-        input_output_maps = get_input_output_maps(transform_id, work)
-        logger.debug(log_prefix + "input_output_maps.keys[:2]: %s" % str(list(input_output_maps.keys())[:2]))
+        logger.debug(log_prefix + "update_contents_from_others_by_dep_id_pages")
+        status_not_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable,
+                               ContentStatus.FinalFailed, ContentStatus.Missing]
+        core_catalog.update_contents_from_others_by_dep_id_pages(request_id=request_id, transform_id=transform_id,
+                                                                 page_size=1000, status_not_to_check=status_not_to_check)
+        logger.debug(log_prefix + "update_contents_from_others_by_dep_id_pages done")
 
         terminated_processing = False
         terminated_status = [ProcessingStatus.Finished, ProcessingStatus.Failed, ProcessingStatus.SubFinished,
                              ProcessingStatus.Terminating, ProcessingStatus.Cancelled]
         if processing['status'] in terminated_status or processing['substatus'] in terminated_status:
             terminated_processing = True
 
+        logger.debug(log_prefix + "update_input_contents_by_dependency_pages")
+        status_not_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable,
+                               ContentStatus.FinalFailed, ContentStatus.Missing]
+        core_catalog.update_input_contents_by_dependency_pages(request_id=request_id, transform_id=transform_id,
+                                                               page_size=default_input_dep_page_size,
+                                                               terminated=terminated_processing,
+                                                               batch_size=1000, status_not_to_check=status_not_to_check)
+        logger.debug(log_prefix + "update_input_contents_by_dependency_pages done")
+
+        with_deps = False
+        input_output_maps = get_input_output_maps(transform_id, work, with_deps=with_deps)
+        logger.debug(log_prefix + "input_output_maps.keys[:2]: %s" % str(list(input_output_maps.keys())[:2]))
+
         updated_contents_ret_chunks = get_updated_contents_by_input_output_maps(input_output_maps=input_output_maps,
                                                                                 terminated=terminated_processing,
                                                                                 max_updates_per_round=max_updates_per_round,
+                                                                                with_deps=with_deps,
                                                                                 logger=logger,
                                                                                 log_prefix=log_prefix)
 
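
Net effect of this hunk: the threaded, per-content trigger path is fenced off behind a docstring and replaced by the paged catalog calls (`update_contents_from_others_by_dep_id_pages`, `update_input_contents_by_dependency_pages`), which process a bounded page of dependency rows at a time, and the in-memory map is then rebuilt without dependencies. The generic shape of that paged-update pattern, sketched with assumed helper names (not the `core_catalog` API):

```python
# Generic paged-update shape; assumed helper names, not the core_catalog API.
def update_in_pages(fetch_page, apply_updates, page_size=1000):
    offset = 0
    while True:
        page = fetch_page(offset=offset, limit=page_size)  # bounded memory per round
        if not page:
            break
        apply_updates(page)  # e.g. one batched UPDATE per page
        offset += len(page)
```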
@@ -1913,7 +1960,7 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc
 def sync_collection_status(request_id, transform_id, workload_id, work, input_output_maps=None,
                            close_collection=False, force_close_collection=False, abort=False, terminate=False):
     if input_output_maps is None:
-        input_output_maps = get_input_output_maps(transform_id, work)
+        input_output_maps = get_input_output_maps(transform_id, work, with_deps=False)
 
     all_updates_flushed = True
     coll_status = {}
@@ -2116,7 +2163,7 @@ def sync_processing(processing, agent_attributes, terminate=False, abort=False,
     work.set_agent_attributes(agent_attributes, processing)
 
     messages = []
-    input_output_maps = get_input_output_maps(transform_id, work)
+    input_output_maps = get_input_output_maps(transform_id, work, with_deps=False)
     if processing['substatus'] in terminated_status or processing['substatus'] in terminated_status:
         terminate = True
     update_collections, all_updates_flushed, msgs = sync_collection_status(request_id, transform_id, workload_id, work,
@@ -2225,7 +2272,7 @@ def handle_resume_processing(processing, agent_attributes, logger=None, log_pref
                          'substatus': CollectionStatus.Open}
         update_collections.append(u_collection)
 
-    input_output_maps = get_input_output_maps(transform_id, work)
+    input_output_maps = get_input_output_maps(transform_id, work, with_deps=False)
     update_contents = reactive_contents(request_id, transform_id, workload_id, work, input_output_maps)
 
     processing['status'] = ProcessingStatus.Running