Skip to content

Commit

Permalink
Merge pull request #5803 from BOINC/dpa_script_val2
Browse files Browse the repository at this point in the history
validator: handle transient errors
  • Loading branch information
AenBleidd authored Sep 10, 2024
2 parents 8cb5497 + d94d625 commit a2b61ad
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 30 deletions.
2 changes: 1 addition & 1 deletion sched/db_purge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ int purge_and_archive_results(DB_WORKUNIT& wu, int& number_results) {
retval = result.delete_from_db();
if (retval) {
log_messages.printf(MSG_CRITICAL,
"Couldn't delete result [%d] from database\n", result.id
"Couldn't delete result [%lu] from database\n", result.id
);
return retval;
}
Expand Down
34 changes: 26 additions & 8 deletions sched/script_validator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ void validate_handler_usage() {
);
}

// see validate_util2.h for return values
//
int init_result(RESULT& result, void*&) {
if (init_script.empty()) {
return 0;
Expand All @@ -120,7 +122,6 @@ int init_result(RESULT& result, void*&) {
return retval;
}


char cmd[4096];
sprintf(cmd, "../bin/%s", init_script[0].c_str());
for (i=1; i<init_script.size(); i++) {
Expand All @@ -139,10 +140,19 @@ int init_result(RESULT& result, void*&) {
}
}
retval = system(cmd);
if (retval) {
return retval;
if (WIFEXITED(retval)) {
int s = WEXITSTATUS(retval);
if (!s) return 0;
if (s == VAL_RESULT_TRANSIENT_ERROR) {
return VAL_RESULT_TRANSIENT_ERROR;
}
log_messages.printf(MSG_CRITICAL,
"init script %s failed: %d\n", cmd, s
);
return -1;
}
return 0;
log_messages.printf(MSG_CRITICAL, "init script %s didn't exit\n", cmd);
return -1;
}

int compare_results(RESULT& r1, void*, RESULT const& r2, void*, bool& match) {
Expand Down Expand Up @@ -196,12 +206,20 @@ int compare_results(RESULT& r1, void*, RESULT const& r2, void*, bool& match) {
}
}
retval = system(cmd);
if (retval) {
if (WIFEXITED(retval)) {
int s = WEXITSTATUS(retval);
if (s == 0) {
match = true;
return 0;
}
if (s == VAL_RESULT_TRANSIENT_ERROR) {
return VAL_RESULT_TRANSIENT_ERROR;
}
match = false;
} else {
match = true;
return 0;
}
return 0;
log_messages.printf(MSG_CRITICAL, "compare script %s didn't exit\n", cmd);
return -1;
}

int cleanup_result(RESULT const&, void*) {
Expand Down
81 changes: 63 additions & 18 deletions sched/validate_util2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ int check_set(
) {
vector<void*> data;
vector<bool> had_error;
int i, j, neq = 0, n, retval;
int i, j, neq = 0, n, retval=0;
int min_valid = wu.min_quorum/2+1;

retry = false;
Expand All @@ -83,29 +83,35 @@ int check_set(
int suspicious_results = 0;
for (i=0; i<n; i++) {
retval = init_result(results[i], data[i]);
if (retval == ERR_OPENDIR) {
switch (retval) {
case 0:
good_results++;
break;
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
log_messages.printf(MSG_CRITICAL,
"check_set: init_result([RESULT#%lu %s]) transient failure\n",
results[i].id, results[i].name
);
retry = true;
had_error[i] = true;
} else if (retval == VAL_RESULT_SUSPICIOUS) {
break;
case VAL_RESULT_SUSPICIOUS:
log_messages.printf(MSG_NORMAL,
"[RESULT#%lu %s] considered to be suspicious\n",
results[i].id, results[i].name
);
suspicious_results++;
} else if (retval) {
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_set: init_result([RESULT#%lu %s]) failed: %s\n",
results[i].id, results[i].name, boincerror(retval)
);
results[i].outcome = RESULT_OUTCOME_VALIDATE_ERROR;
results[i].validate_state = VALIDATE_STATE_INVALID;
had_error[i] = true;
} else {
good_results++;
break;
}
}

Expand Down Expand Up @@ -141,14 +147,29 @@ int check_set(
if (i == j) {
++neq;
matches[j] = true;
} else if (compare_results(results[i], data[i], results[j], data[j], match)) {
continue;
}
retval = compare_results(
results[i], data[i], results[j], data[j], match
);
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
retry = true;
retval = 0;
goto cleanup;
case 0:
if (match) {
++neq;
matches[j] = true;
}
break;
default:
log_messages.printf(MSG_CRITICAL,
"generic_check_set: check_pair_with_data([RESULT#%lu %s], [RESULT#%lu %s]) failed\n",
"check_set(): compare_results([RESULT#%lu %s], [RESULT#%lu %s]) failed\n",
results[i].id, results[i].name, results[j].id, results[j].name
);
} else if (match) {
++neq;
matches[j] = true;
goto cleanup;
}
}
if (neq >= min_valid) {
Expand All @@ -164,14 +185,15 @@ int check_set(
}
}

retval = 0;
cleanup:

for (i=0; i<n; i++) {
cleanup_result(results[i], data[i]);
}
return 0;
return retval;
}

// a straggler instance has arrived after the WU is already validated.
// r1 is the new result; r2 is canonical result
//
void check_pair(RESULT& r1, RESULT& r2, bool& retry) {
Expand All @@ -182,14 +204,18 @@ void check_pair(RESULT& r1, RESULT& r2, bool& retry) {

retry = false;
retval = init_result(r1, data1);
if (retval == ERR_OPENDIR) {
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) transient failure 1\n",
r1.id, r1.name
);
retry = true;
return;
} else if (retval) {
case 0:
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) perm failure 1\n",
r1.id, r1.name
Expand All @@ -200,15 +226,19 @@ void check_pair(RESULT& r1, RESULT& r2, bool& retry) {
}

retval = init_result(r2, data2);
if (retval == ERR_OPENDIR) {
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) transient failure 2\n",
r2.id, r2.name
);
cleanup_result(r1, data1);
retry = true;
return;
} else if (retval) {
case 0:
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) perm failure2\n",
r2.id, r2.name
Expand All @@ -220,7 +250,22 @@ void check_pair(RESULT& r1, RESULT& r2, bool& retry) {
}

retval = compare_results(r1, data1, r2, data2, match);
r1.validate_state = match?VALIDATE_STATE_VALID:VALIDATE_STATE_INVALID;
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
retry = true;
break;
case 0:
r1.validate_state = match?VALIDATE_STATE_VALID:VALIDATE_STATE_INVALID;
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_pair: compare_results([RESULT#%lu RESULT#%lu]) perm failure\n",
r1.id, r2.id
);
r1.validate_state = VALIDATE_STATE_INVALID;
break;
}
cleanup_result(r1, data1);
cleanup_result(r2, data2);
}
14 changes: 11 additions & 3 deletions sched/validate_util2.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,20 @@

#include "boinc_db_types.h"

// special return values of init_result():
// return values of init_result() and compare_results()
// (in addition to usual error codes)
//
#define VAL_RESULT_SUSPICIOUS 1
// if an "adaptive replication" result looks suspicious
// the result looks 'suspicious'.
// if we're using adaptive replication and this is the only copy,
// create a 2nd replica rather than accepting it.
// (Are any projects using this?)
#define VAL_RESULT_LONG_TERM_FAIL 2
// host is unlikely to handle this app version; stop using
// host is unlikely to handle this app version; stop using it
#define VAL_RESULT_TRANSIENT_ERROR 3
// a transient error happened (e.g. couldn't read file due to NFS failure).
// Retry validation later.
// ERR_OPENDIR is also treated this way.

extern int init_result(RESULT&, void*&);
extern int compare_results(RESULT &, void*, RESULT const&, void*, bool&);
Expand Down

0 comments on commit a2b61ad

Please sign in to comment.