41
41
from cluster import expected_precision_loss_by_query_reduction
42
42
from cluster import select_best_variant
43
43
import config
44
- from gp_query import ask_multi_query
44
+ from gp_query import ask_multi_query , \
45
+ variable_substitution_deep_narrow_mut_query
45
46
from gp_query import calibrate_query_timeout
46
47
from gp_query import combined_ask_count_multi_query
47
48
from gp_query import predict_query
@@ -420,7 +421,7 @@ def _mutate_expand_node_helper(node, pb_en_out_link=config.MUTPB_EN_OUT_LINK):
420
421
new_triple = (node , var_edge , var_node )
421
422
else :
422
423
new_triple = (var_node , var_edge , node )
423
- return new_triple , var_node
424
+ return new_triple , var_node , var_edge
424
425
425
426
426
427
def mutate_expand_node (child , node = None ):
@@ -433,11 +434,10 @@ def mutate_expand_node(child, node=None):
433
434
434
435
435
436
def mutate_deep_narrow_path (
436
- child ,
437
+ child , sparql , timeout , gtp_scores ,
437
438
min_len = config .MUTPB_DN_MIN_LEN ,
438
439
max_len = config .MUTPB_DN_MAX_LEN ,
439
440
term_pb = config .MUTPB_DN_TERM_PB ,
440
- pb_en_out_link = config .MUTPB_EN_OUT_LINK ,
441
441
):
442
442
assert isinstance (child , GraphPattern )
443
443
nodes = list (child .nodes )
@@ -451,15 +451,76 @@ def mutate_deep_narrow_path(
451
451
if hop >= max_len :
452
452
break
453
453
hop += 1
454
- new_triple , var_node = _mutate_expand_node_helper (start_node )
455
- gp += [new_triple ]
456
- start_node = var_node
454
+ new_triple , var_node , var_edge = _mutate_expand_node_helper (start_node )
455
+ test_gp = gp + [new_triple ]
456
+ test_gp , fixed = _mutate_deep_narrow_path_helper (
457
+ sparql , timeout , gtp_scores , test_gp , var_edge , var_node )
458
+ if fixed == 'Y' :
459
+ start_node = var_node
460
+ gp = test_gp
457
461
458
462
# TODO: insert connection to a target node
459
463
# TODO: fix edge or node ( to_count_var_over_values_query)
460
464
return gp
461
465
462
466
467
def _mutate_deep_narrow_path_helper(
        sparql,
        timeout,
        gtp_scores,
        child,
        edge_var,
        node_var,
        gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
        limit_res=config.MUTPB_DN_QUERY_LIMIT,
        sample_n=config.MUTPB_FV_SAMPLE_MAXN,
):
    """Try to fix ``edge_var`` in ``child`` to concrete substitutions.

    A random-sized sample of the remaining ground truth pairs is drawn from
    ``gtp_scores`` and passed, together with ``child``, ``edge_var`` and
    ``node_var``, to ``variable_substitution_deep_narrow_mut_query``. The
    substitution counts it returns are filtered with
    ``mutate_fix_var_filter`` and up to ``sample_n`` substitutions are picked
    via ``sample_from_list`` weighted by their counts.

    :param sparql: passed through to the substitution query.
    :param timeout: passed through to the substitution query.
    :param gtp_scores: ``GTPScores`` used to sample ground truth pairs.
    :param child: ``GraphPattern`` containing ``edge_var``.
    :param edge_var: the variable that is substituted in the result patterns.
    :param node_var: only forwarded to the substitution query.
    :param gtp_sample_n: upper bound for the number of sampled ground truth
        pairs (additionally capped by ``int(gtp_scores.remaining_gain)``).
    :param limit_res: passed through to the substitution query.
    :param sample_n: number handed to ``sample_from_list``.
    :return: tuple ``(patterns, fixed)``. ``fixed`` is ``'Y'`` or ``'N'``.
        With ``'N'`` ``patterns`` is ``[child]``; with ``'Y'`` it is a list
        holding one ``GraphPattern`` per sampled substitution.
    """
    assert isinstance(child, GraphPattern)
    assert isinstance(gtp_scores, GTPScores)

    # The further we get, the less gtps are remaining. Sampling too many (all)
    # of them might hurt as common substitutions (> limit ones) which are dead
    # ends could cover less common ones that could actually help
    max_gtps = min(gtp_sample_n, int(gtp_scores.remaining_gain))
    if max_gtps < 1:
        # random.randint(1, 0) would raise ValueError (empty range); without
        # at least one gtp to sample there is nothing to base a fix on, so
        # report the variable as not fixed instead of crashing the mutation.
        return [child], 'N'
    gtp_sample_n = random.randint(1, max_gtps)

    ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(
        n=gtp_sample_n)
    t, substitution_counts = variable_substitution_deep_narrow_mut_query(
        sparql, timeout, child, edge_var, node_var, ground_truth_pairs,
        limit_res)
    if not substitution_counts:
        # the current pattern is unfit, as we can't find anything fulfilling it
        logger.debug("tried to fix a var %s without result:\n %s"
                     "seems as if the pattern can't be fulfilled!",
                     edge_var, child.to_sparql_select_query())
        return [child], 'N'

    # NOTE(review): presumably filters substitution_counts in place (its
    # return value is ignored and emptiness is re-checked below) - confirm.
    mutate_fix_var_filter(substitution_counts)
    if not substitution_counts:
        # could have happened that we removed the only possible substitution
        return [child], 'N'

    # randomly pick n of the substitutions with a prob ~ to their counts
    ranked = substitution_counts.most_common()
    items, counts = zip(*ranked)
    substs = sample_from_list(items, counts, sample_n)
    logger.info(
        'fixed variable %s in %sto:\n %s\n <%d out of:\n %s\n ',
        edge_var.n3(),
        child,
        '\n '.join([subst.n3() for subst in substs]),
        sample_n,
        '\n '.join([' %d: %s' % (c, v.n3()) for v, c in ranked]),
    )
    res = [
        GraphPattern(child, mapping={edge_var: subst})
        for subst in substs
    ]
    return res, 'Y'
522
+
523
+
463
524
def mutate_add_edge (child ):
464
525
# TODO: can maybe be improved by sparqling
465
526
nodes = list (child .nodes )
@@ -682,6 +743,7 @@ def mutate(
682
743
pb_dt = config .MUTPB_DT ,
683
744
pb_en = config .MUTPB_EN ,
684
745
pb_fv = config .MUTPB_FV ,
746
+ pb_dn = config .MUTPB_DN ,
685
747
pb_id = config .MUTPB_ID ,
686
748
pb_iv = config .MUTPB_IV ,
687
749
pb_mv = config .MUTPB_MV ,
@@ -721,6 +783,9 @@ def mutate(
721
783
if random .random () < pb_sp :
722
784
child = mutate_simplify_pattern (child )
723
785
786
+ if random .random () < pb_dn :
787
+ child = mutate_deep_narrow_path (child , sparql , timeout , gtp_scores )
788
+
724
789
if random .random () < pb_fv :
725
790
child = canonicalize (child )
726
791
children = mutate_fix_var (sparql , timeout , gtp_scores , child )
0 commit comments