From 26d788f1e76e3d827fbb4108a82a439c80c0feba Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 08:48:56 -0600 Subject: [PATCH 01/61] Multiple communicators --- src/mpi/mpi_caf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 55b288df2..a73c47ad4 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -104,6 +104,10 @@ char err_buffer[MPI_MAX_ERROR_STRING]; MPI_COMM_WORLD for interoperability purposes. */ MPI_Comm CAF_COMM_WORLD; +/* Replicated communicators used by Failed Images */ +MPI_Comm *communicators; +int used_comm = -1; + /* For MPI interoperability, allow external initialization (and thus finalization) of MPI. */ bool caf_owns_mpi = false; @@ -379,6 +383,11 @@ PREFIX (init) (int *argc, char ***argv) stat_tok = malloc (sizeof(MPI_Win)); + communicators = (MPI_Comm *)calloc(caf_num_images,sizeof(MPI_Comm)); + + for(i=0;i= 3 MPI_Info_create (&mpi_info_same_size); MPI_Info_set (mpi_info_same_size, "same_size", "true"); From 2ba109e1a0d38b18a5521e5834501155d1e37274 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 09:58:07 -0600 Subject: [PATCH 02/61] fail image and communicator error handling --- src/libcaf.h | 1 + src/mpi/mpi_caf.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/libcaf.h b/src/libcaf.h index 1ea065ca1..d7174874e 100644 --- a/src/libcaf.h +++ b/src/libcaf.h @@ -59,6 +59,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define STAT_LOCKED_OTHER_IMAGE 2 #define STAT_DUP_SYNC_IMAGES 3 #define STAT_STOPPED_IMAGE 6000 +#define STAT_FAILED_IMAGE 6001 /* Describes what type of array we are registerring. Keep in sync with gcc/fortran/trans.h. */ diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index a73c47ad4..b3406975c 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -41,6 +41,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include +#include +#include #include "libcaf.h" @@ -104,9 +106,40 @@ char err_buffer[MPI_MAX_ERROR_STRING]; MPI_COMM_WORLD for interoperability purposes. */ MPI_Comm CAF_COMM_WORLD; -/* Replicated communicators used by Failed Images */ +/* Failed Images */ MPI_Comm *communicators; int used_comm = -1; +int *ranks_gc,*ranks_gf; //to be returned by failed images +MPI_Errhandler errh; + +static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ + MPI_Comm comm; + int nf,nc,i; + MPI_Group group_c, group_f; + comm = *pcomm; + + memset(ranks_gf,0,sizeof(int)*caf_num_images); + memset(ranks_gc,0,sizeof(int)*caf_num_images); + + MPIX_Comm_failure_ack(comm); + MPIX_Comm_failure_get_acked(comm, &group_f); + MPI_Group_size(group_f, &nf); + MPI_Comm_group(comm, &group_c); + for(i = 0; i < nf; i++) + ranks_gf[i] = i; + MPI_Group_translate_ranks(group_f, nf, ranks_gf, + group_c, ranks_gc); + for(i = 0; i < nf; i++) + { + ranks_gc[i] = i+1; + printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); + } + + used_comm++; + CAF_COMM_WORLD = communicators[used_comm]; + /* MPI_Barrier(CAF_COMM_WORLD); */ + /* printf("%d after barrier %d\n",caf_this_image,used_comm); */ +} /* For MPI interoperability, allow external initialization (and thus finalization) of MPI. */ @@ -385,8 +418,17 @@ PREFIX (init) (int *argc, char ***argv) communicators = (MPI_Comm *)calloc(caf_num_images,sizeof(MPI_Comm)); + MPI_Comm_create_errhandler(verbose_comm_errhandler, &errh); + MPI_Comm_set_errhandler(CAF_COMM_WORLD, errh); + for(i=0;i= 3 MPI_Info_create (&mpi_info_same_size); @@ -2535,3 +2577,10 @@ PREFIX (error_stop) (int32_t error) fprintf (stderr, "ERROR STOP %d\n", error); error_stop (error); } + +void +PREFIX (fail_image) (void) +{ + /* *img_status = STAT_FAILED_IMAGE; */ + raise(SIGKILL); +} From b6688c1ed2953387fbf6c8d6d35500e6aee00f20 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 11:04:29 -0600 Subject: [PATCH 03/61] MPI Win error handling --- src/mpi/mpi_caf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index b3406975c..c02a8c7b0 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -110,7 +110,13 @@ MPI_Comm CAF_COMM_WORLD; MPI_Comm *communicators; int used_comm = -1; int *ranks_gc,*ranks_gf; //to be returned by failed images -MPI_Errhandler errh; +MPI_Errhandler errh,errh_w; + +static void verbose_win_errhandler(MPI_Win* win, int* err, ...) { + /* printf("in win err handler\n"); */ + /* used_comm++; */ + /* CAF_COMM_WORLD = communicators[used_comm]; */ +} static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ MPI_Comm comm; @@ -132,13 +138,11 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = 0; i < nf; i++) { ranks_gc[i] = i+1; - printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); + /* printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); */ } - used_comm++; - CAF_COMM_WORLD = communicators[used_comm]; - /* MPI_Barrier(CAF_COMM_WORLD); */ - /* printf("%d after barrier %d\n",caf_this_image,used_comm); */ + //used_comm++; + //CAF_COMM_WORLD = communicators[used_comm]; } /* For MPI interoperability, allow external initialization @@ -421,6 +425,8 @@ PREFIX (init) (int *argc, char ***argv) MPI_Comm_create_errhandler(verbose_comm_errhandler, &errh); MPI_Comm_set_errhandler(CAF_COMM_WORLD, errh); + MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); + for(i=0;i Date: Mon, 9 May 2016 11:05:44 -0600 Subject: [PATCH 04/61] MPI Win error handling --- src/mpi/mpi_caf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index c02a8c7b0..1ed586226 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -138,7 +138,7 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = 0; i < nf; i++) { ranks_gc[i] = i+1; - /* printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); */ + printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); } //used_comm++; From 9226f3acce1ce8244491b3d0081b2bd82498d648 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 11:15:44 -0600 Subject: [PATCH 05/61] Failed image number detected --- src/mpi/mpi_caf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 1ed586226..0d7b1b557 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -137,10 +137,10 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ group_c, ranks_gc); for(i = 0; i < nf; i++) { - ranks_gc[i] = i+1; + ranks_gc[i]++; printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); } - + //used_comm++; //CAF_COMM_WORLD = communicators[used_comm]; } From 6dc12a4ef9dac6b5608e157eb09f4e998d1592a5 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 11:33:08 -0600 Subject: [PATCH 06/61] Image_status --- src/mpi/mpi_caf.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 0d7b1b557..6cbdd3fc5 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -108,7 +108,7 @@ MPI_Comm CAF_COMM_WORLD; /* Failed Images */ MPI_Comm *communicators; -int used_comm = -1; +int used_comm = -1, n_failed_imgs=0; int *ranks_gc,*ranks_gf; //to be returned by failed images MPI_Errhandler errh,errh_w; @@ -138,9 +138,11 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = 0; i < nf; i++) { ranks_gc[i]++; - printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); + /* printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); */ } + n_failed_imgs = nf; + //used_comm++; //CAF_COMM_WORLD = communicators[used_comm]; } @@ -449,6 +451,7 @@ PREFIX (init) (int *argc, char ***argv) MPI_Win_create(img_status, sizeof(int), 1, MPI_INFO_NULL, CAF_COMM_WORLD, stat_tok); #endif // MPI_VERSION *img_status = 0; + MPI_Win_set_errhandler(*stat_tok,errh_w); } /* MPI_Barrier(CAF_COMM_WORLD); */ } @@ -2592,3 +2595,33 @@ PREFIX (fail_image) (void) /* *img_status = STAT_FAILED_IMAGE; */ raise(SIGKILL); } + +int +PREFIX (image_status) (int image) +{ + int i,res=0, remote_stat=0,ierr; + + for(i=0;i Date: Mon, 9 May 2016 11:35:06 -0600 Subject: [PATCH 07/61] Disabling general error detection in image_status --- src/mpi/mpi_caf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 6cbdd3fc5..49e6d03bd 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -2620,8 +2620,11 @@ PREFIX (image_status) (int image) if(remote_stat != 0) res = STAT_STOPPED_IMAGE; - if(ierr != MPI_SUCCESS) - res = 1; + /* if(ierr != MPI_SUCCESS) */ + /* { */ + /* printf("error %d\n",ierr); */ + /* res = 1; */ + /* } */ return res; } From a99a2ccf0e4d780e6e44d911951dd016a7b433d0 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 13:39:21 -0600 Subject: [PATCH 08/61] Supporting stat_stopped_image in image_status --- src/mpi/mpi_caf.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 49e6d03bd..d02bf2818 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -143,8 +143,8 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ n_failed_imgs = nf; - //used_comm++; - //CAF_COMM_WORLD = communicators[used_comm]; + /* used_comm++; */ + /* CAF_COMM_WORLD = communicators[used_comm]; */ } /* For MPI interoperability, allow external initialization @@ -2620,11 +2620,8 @@ PREFIX (image_status) (int image) if(remote_stat != 0) res = STAT_STOPPED_IMAGE; - /* if(ierr != MPI_SUCCESS) */ - /* { */ - /* printf("error %d\n",ierr); */ - /* res = 1; */ - /* } */ + if(ierr != MPI_SUCCESS) + res = 1; return res; } From c01a05ab553842f29ac86bcbdc3d5cd41b0c8286 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 9 May 2016 15:09:59 -0600 Subject: [PATCH 09/61] Resuming multiple communicators --- src/mpi/mpi_caf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index d02bf2818..6aea28ebd 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -143,8 +143,8 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ n_failed_imgs = nf; - /* used_comm++; */ - /* CAF_COMM_WORLD = communicators[used_comm]; */ + used_comm++; + CAF_COMM_WORLD = communicators[used_comm]; } /* For MPI interoperability, allow external initialization From 8cdc90e8885d57ef2279a6217ce1616e5901a150 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 11 May 2016 13:08:58 -0600 Subject: [PATCH 10/61] Failed images draft --- src/mpi/mpi_caf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 6aea28ebd..066f166c5 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -2625,3 +2625,15 @@ PREFIX (image_status) (int image) return res; } + +void * +PREFIX (failed_images) (int *num_failed_images, int team __attribute__ ((unused)), + int kind __attribute__ ((unused))) +{ + void *mem; + printf("Failed images:%d\n",n_failed_imgs); + *num_failed_images = n_failed_imgs; + mem = calloc(n_failed_imgs,sizeof(int)); + memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int)); + return mem; +} From d6640d9137f8e05a320b26b7e4b1eeeaa4782088 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 16 May 2016 10:04:41 -0600 Subject: [PATCH 11/61] Shrinking communicator --- src/mpi/mpi_caf.c | 106 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 15 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 066f166c5..bc5186993 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -107,10 +107,11 @@ char err_buffer[MPI_MAX_ERROR_STRING]; MPI_Comm CAF_COMM_WORLD; /* Failed Images */ -MPI_Comm *communicators; -int used_comm = -1, n_failed_imgs=0; +/* MPI_Comm *communicators; */ +int used_comm = -1, n_failed_imgs=0, error_called=0; int *ranks_gc,*ranks_gf; //to be returned by failed images MPI_Errhandler errh,errh_w; +int completed = 0; static void verbose_win_errhandler(MPI_Win* win, int* err, ...) { /* printf("in win err handler\n"); */ @@ -142,9 +143,9 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ } n_failed_imgs = nf; - - used_comm++; - CAF_COMM_WORLD = communicators[used_comm]; + error_called = 1; + /* used_comm++; */ + /* CAF_COMM_WORLD = communicators[used_comm]; */ } /* For MPI interoperability, allow external initialization @@ -422,18 +423,18 @@ PREFIX (init) (int *argc, char ***argv) stat_tok = malloc (sizeof(MPI_Win)); - communicators = (MPI_Comm *)calloc(caf_num_images,sizeof(MPI_Comm)); + /* communicators = (MPI_Comm *)calloc(caf_num_images,sizeof(MPI_Comm)); */ MPI_Comm_create_errhandler(verbose_comm_errhandler, &errh); MPI_Comm_set_errhandler(CAF_COMM_WORLD, errh); MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); - for(i=0;i ns ) MPI_Abort(CAF_COMM_WORLD, MPI_ERR_PROC_FAILED); */ + + /* /\* remembering the former rank: we will reassign the same */ + /* * ranks in the new world. *\/ */ + /* MPI_Comm_rank(CAF_COMM_WORLD, &crank); */ + + /* /\* the rank 0 in the shrinked comm is going to determine the */ + /* * ranks at which the spares need to be inserted. *\/ */ + /* if(0 == srank) { */ + /* /\* getting the group of dead processes: */ + /* * those in comm, but not in shrinked are the deads *\/ */ + /* MPI_Comm_group(CAF_COMM_WORLD, &cgrp); MPI_Comm_group(shrunk, &sgrp); */ + /* MPI_Group_difference(cgrp, sgrp, &dgrp); MPI_Group_size(dgrp, &nd); */ + /* /\* Computing the rank assignment for the newly inserted spares *\/ */ + /* for(i=0; i= 3 MPI_Win_allocate(actual_size, 1, mpi_info_same_size, CAF_COMM_WORLD, &mem, *token); # ifndef CAF_MPI_LOCK_UNLOCK @@ -728,6 +798,13 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { int ierr=0; + if(error_called == 1) + { + /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ + communicator_shrink(); + error_called = 0; + } + if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; else @@ -2626,14 +2703,13 @@ PREFIX (image_status) (int image) return res; } -void * +int * PREFIX (failed_images) (int *num_failed_images, int team __attribute__ ((unused)), int kind __attribute__ ((unused))) { - void *mem; - printf("Failed images:%d\n",n_failed_imgs); + int *mem; *num_failed_images = n_failed_imgs; - mem = calloc(n_failed_imgs,sizeof(int)); + mem = (int *)calloc(n_failed_imgs,sizeof(int)); memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int)); return mem; } From 17699e555d93b15e22ad55d4188e79f5e2059eed Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 16 May 2016 12:59:20 -0600 Subject: [PATCH 12/61] Partial fault tolerant support for locks --- src/mpi/mpi_caf.c | 59 ++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index bc5186993..d2899e52a 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -268,7 +268,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { const char msg[] = "Already locked"; #if MPI_VERSION >= 3 - int value=1, compare = 0, newval = caf_this_image, i = 1; + int value=1, compare = 0, newval = caf_this_image, i = 1,zero=0; if(stat != NULL) *stat = 0; @@ -290,8 +290,26 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, while(value != 0) { locking_atomic_op(win, &value, newval, compare, image_index, index); - usleep(caf_this_image*i); - i++; + printf("n_failed_images: %d\n",n_failed_imgs); + for(i=0;iprev; @@ -552,21 +570,21 @@ static int communicator_shrink() /* /\* the rank 0 in the shrinked comm is going to determine the */ /* * ranks at which the spares need to be inserted. *\/ */ - /* if(0 == srank) { */ - /* /\* getting the group of dead processes: */ - /* * those in comm, but not in shrinked are the deads *\/ */ - /* MPI_Comm_group(CAF_COMM_WORLD, &cgrp); MPI_Comm_group(shrunk, &sgrp); */ - /* MPI_Group_difference(cgrp, sgrp, &dgrp); MPI_Group_size(dgrp, &nd); */ - /* /\* Computing the rank assignment for the newly inserted spares *\/ */ - /* for(i=0; i Date: Wed, 18 May 2016 13:17:40 -0600 Subject: [PATCH 13/61] Partial patch for locks --- src/mpi/mpi_caf.c | 110 +++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index d2899e52a..e34e9c702 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -107,11 +107,12 @@ char err_buffer[MPI_MAX_ERROR_STRING]; MPI_Comm CAF_COMM_WORLD; /* Failed Images */ -/* MPI_Comm *communicators; */ +MPI_Comm lock_comm; +MPI_Request lock_req; int used_comm = -1, n_failed_imgs=0, error_called=0; int *ranks_gc,*ranks_gf; //to be returned by failed images MPI_Errhandler errh,errh_w; -int completed = 0; +int completed = 0,tmp_lock; static void verbose_win_errhandler(MPI_Win* win, int* err, ...) { /* printf("in win err handler\n"); */ @@ -251,11 +252,12 @@ caf_runtime_error (const char *message, ...) /* inline */ void locking_atomic_op(MPI_Win win, int *value, int newval, int compare, int image_index, int index) { + int ret; # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, 0, win); # endif // CAF_MPI_LOCK_UNLOCK - MPI_Compare_and_swap (&newval,&compare,value, MPI_INT,image_index-1, - index*sizeof(int), win); + ret = MPI_Compare_and_swap (&newval,&compare,value, MPI_INT,image_index-1, + index*sizeof(int), win); # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_unlock (image_index-1, win); # else // CAF_MPI_LOCK_UNLOCK @@ -268,11 +270,29 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { const char msg[] = "Already locked"; #if MPI_VERSION >= 3 - int value=1, compare = 0, newval = caf_this_image, i = 1,zero=0; + int value=1, compare = 0, newval = caf_this_image, i = 1,zero=0,ret=0; + int flag, it = 0, check_failure = 100; if(stat != NULL) *stat = 0; + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + /* if(error_called == 1) */ + /* { */ + /* /\* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); *\/ */ + /* communicator_shrink(&CAF_COMM_WORLD); */ + /* error_called = 0; */ + /* } */ + + if(error_called == 1) + { + /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ + communicator_shrink(&lock_comm); + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + } + locking_atomic_op(win, &value, newval, compare, image_index, index); if(value == caf_this_image && image_index == caf_this_image) @@ -289,11 +309,26 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, while(value != 0) { + it++; + + if(it == check_failure) + { + it = 1; + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + } + + if(error_called == 1) + { + /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ + communicator_shrink(&lock_comm); + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + } + locking_atomic_op(win, &value, newval, compare, image_index, index); - printf("n_failed_images: %d\n",n_failed_imgs); + for(i=0;i ns ) MPI_Abort(CAF_COMM_WORLD, MPI_ERR_PROC_FAILED); */ - - /* /\* remembering the former rank: we will reassign the same */ - /* * ranks in the new world. *\/ */ - /* MPI_Comm_rank(CAF_COMM_WORLD, &crank); */ - - /* /\* the rank 0 in the shrinked comm is going to determine the */ - /* * ranks at which the spares need to be inserted. *\/ */ - /* if(0 == srank) { */ - /* /\* getting the group of dead processes: */ - /* * those in comm, but not in shrinked are the deads *\/ */ - /* MPI_Comm_group(CAF_COMM_WORLD, &cgrp); MPI_Comm_group(shrunk, &sgrp); */ - /* MPI_Group_difference(cgrp, sgrp, &dgrp); MPI_Group_size(dgrp, &nd); */ - /* /\* Computing the rank assignment for the newly inserted spares *\/ */ - /* for(i=0; i Date: Wed, 18 May 2016 13:55:24 -0600 Subject: [PATCH 14/61] Partial patch for locks --- src/mpi/mpi_caf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index e34e9c702..8c99878f3 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -288,7 +288,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ - communicator_shrink(&lock_comm); + /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -320,7 +320,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ - communicator_shrink(&lock_comm); + //communicator_shrink(&lock_comm); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -591,7 +591,7 @@ int communicator_shrink(MPI_Comm *comm) /* Split does the magic: removing spare processes and reordering ranks * so that all surviving processes remain at their former place */ rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm); - + /* Split or some of the communications above may have failed if * new failures have disrupted the process: we need to * make sure we succeeded at all ranks, or retry until it works. */ @@ -650,8 +650,8 @@ void * if(error_called == 1) { /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ + //communicator_shrink(&lock_comm); communicator_shrink(&CAF_COMM_WORLD); - communicator_shrink(&lock_comm); error_called = 0; } @@ -819,8 +819,8 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) if(error_called == 1) { /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ + //communicator_shrink(&lock_comm); communicator_shrink(&CAF_COMM_WORLD); - communicator_shrink(&lock_comm); error_called = 0; } From 23d2f478c6d9bd8197ea5b6b475bb79cc1e84f5f Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 18 May 2016 14:07:42 -0600 Subject: [PATCH 15/61] Minor changes --- src/mpi/mpi_caf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 8c99878f3..102a6abe2 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -287,7 +287,6 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { - /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; @@ -319,7 +318,6 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { - /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ //communicator_shrink(&lock_comm); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; @@ -518,6 +516,10 @@ PREFIX (finalize) (void) completed = 1; + printf("finalizing\n"); + + MPI_Cancel(&lock_req); + MPI_Request_free(&lock_req); MPI_Barrier(CAF_COMM_WORLD); while (caf_static_list != NULL) @@ -649,8 +651,7 @@ void * if(error_called == 1) { - /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ - //communicator_shrink(&lock_comm); + /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -818,8 +819,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) if(error_called == 1) { - /* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); */ - //communicator_shrink(&lock_comm); + /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } From b9d0761344982f992a64e65c7f6575d0ab078309 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 18 May 2016 14:15:07 -0600 Subject: [PATCH 16/61] Debug message as last instruction --- src/mpi/mpi_caf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 102a6abe2..04bb6034d 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -516,8 +516,6 @@ PREFIX (finalize) (void) completed = 1; - printf("finalizing\n"); - MPI_Cancel(&lock_req); MPI_Request_free(&lock_req); MPI_Barrier(CAF_COMM_WORLD); @@ -557,6 +555,7 @@ PREFIX (finalize) (void) pthread_mutex_lock(&lock_am); caf_is_finalized = 1; pthread_mutex_unlock(&lock_am); + printf("finalizing\n"); exit(0); } From b3d2f9ed2ee3dee1d7bbacc16be53d4fe9341c4d Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 19 May 2016 11:04:53 -0600 Subject: [PATCH 17/61] Working version of failed_images --- src/mpi/mpi_caf.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 04bb6034d..fc7dd139c 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -2721,13 +2721,16 @@ PREFIX (image_status) (int image) return res; } -int * -PREFIX (failed_images) (int *num_failed_images, int team __attribute__ ((unused)), +void +PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused)), int kind __attribute__ ((unused))) { - int *mem; - *num_failed_images = n_failed_imgs; - mem = (int *)calloc(n_failed_imgs,sizeof(int)); + int *mem = (int *)calloc(n_failed_imgs,sizeof(int)); + array->base_addr = mem; memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int)); - return mem; + array->dtype = 265; + array->dim[0].lower_bound = 1; + array->dim[0]._ubound = n_failed_imgs; + array->dim[0]._stride = 1; + array->offset = -1; } From aa4a61ab146bf89202538247dc19b8c00a351bdf Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 25 May 2016 10:47:01 -0600 Subject: [PATCH 18/61] Sorting failed images --- src/mpi/mpi_caf.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index fc7dd139c..638814571 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -114,6 +114,11 @@ int *ranks_gc,*ranks_gf; //to be returned by failed images MPI_Errhandler errh,errh_w; int completed = 0,tmp_lock; +static int cmpfunc (const void *a, const void *b) +{ + return ( *(int*)a - *(int*)b ); +} + static void verbose_win_errhandler(MPI_Win* win, int* err, ...) { /* printf("in win err handler\n"); */ /* used_comm++; */ @@ -278,13 +283,6 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); - /* if(error_called == 1) */ - /* { */ - /* /\* MPIX_Comm_agree( CAF_COMM_WORLD, &completed ); *\/ */ - /* communicator_shrink(&CAF_COMM_WORLD); */ - /* error_called = 0; */ - /* } */ - if(error_called == 1) { /* communicator_shrink(&lock_comm); */ @@ -318,7 +316,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { - //communicator_shrink(&lock_comm); + /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -2728,6 +2726,7 @@ PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused int *mem = (int *)calloc(n_failed_imgs,sizeof(int)); array->base_addr = mem; memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int)); + qsort(mem,n_failed_imgs,sizeof(int),cmpfunc); array->dtype = 265; array->dim[0].lower_bound = 1; array->dim[0]._ubound = n_failed_imgs; From c4841e8807999852024157b8766c6aab8ef11f36 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 25 May 2016 13:53:45 -0600 Subject: [PATCH 19/61] Managing stat for sync all --- src/mpi/mpi_caf.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 638814571..b9a1770f2 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -285,11 +285,10 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { - /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } - + locking_atomic_op(win, &value, newval, compare, image_index, index); if(value == caf_this_image && image_index == caf_this_image) @@ -316,7 +315,6 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { - /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -648,7 +646,6 @@ void * if(error_called == 1) { - /* communicator_shrink(&lock_comm); */ communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -814,16 +811,9 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { int ierr=0; - if(error_called == 1) - { - /* communicator_shrink(&lock_comm); */ - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - } - if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; - else + else if(ierr != STAT_FAILED_IMAGE) { #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK) explicit_flush(); @@ -832,10 +822,19 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) ierr = 0; } + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + if (stat) *stat = ierr; - - if (ierr) + else if(ierr == STAT_FAILED_IMAGE) + error_stop (ierr); + + if (ierr != STAT_FAILED_IMAGE) { char *msg; if (caf_is_finalized) From 01b75ce676fcb94fd657f41019bad9b075549605 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 25 May 2016 14:20:27 -0600 Subject: [PATCH 20/61] Managing stat for sync all --- src/mpi/mpi_caf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index b9a1770f2..734292e82 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -275,7 +275,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { const char msg[] = "Already locked"; #if MPI_VERSION >= 3 - int value=1, compare = 0, newval = caf_this_image, i = 1,zero=0,ret=0; + int value=0, compare = 0, newval = caf_this_image, i = 1,zero=0,ret=0; int flag, it = 0, check_failure = 100; if(stat != NULL) From 5448ec506308e44f747c010d5e9c8c38171dcb02 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 25 May 2016 14:31:52 -0600 Subject: [PATCH 21/61] Fixed bug in sync_all --- src/mpi/mpi_caf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 734292e82..424a1f55d 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -678,7 +678,8 @@ void * MPI_Win_flush(caf_this_image-1, *p); # endif // CAF_MPI_LOCK_UNLOCK free(init_array); - PREFIX(sync_all) (NULL,NULL,0); + MPI_Barrier(CAF_COMM_WORLD); + /* PREFIX(sync_all) (NULL,NULL,0); */ } caf_static_t *tmp = malloc (sizeof (caf_static_t)); @@ -754,7 +755,8 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len caf_runtime_error (msg); } - PREFIX (sync_all) (NULL, NULL, 0); + /* PREFIX (sync_all) (NULL, NULL, 0); */ + MPI_Barrier(CAF_COMM_WORLD); caf_static_t *tmp = caf_tot, *prev = caf_tot, *next=caf_tot; MPI_Win *p = *token; @@ -811,6 +813,13 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { int ierr=0; + /* if(error_called == 1) */ + /* { */ + /* communicator_shrink(&CAF_COMM_WORLD); */ + /* error_called = 0; */ + /* ierr = STAT_FAILED_IMAGE; */ + /* } */ + if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; else if(ierr != STAT_FAILED_IMAGE) @@ -819,7 +828,6 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) explicit_flush(); #endif MPI_Barrier(CAF_COMM_WORLD); - ierr = 0; } if(error_called == 1) @@ -833,8 +841,8 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) *stat = ierr; else if(ierr == STAT_FAILED_IMAGE) error_stop (ierr); - - if (ierr != STAT_FAILED_IMAGE) + + if (ierr != 0 && ierr != STAT_FAILED_IMAGE) { char *msg; if (caf_is_finalized) From be8d98f4330261276daabc0267e9a59466e19868 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 25 May 2016 14:34:14 -0600 Subject: [PATCH 22/61] Fixed bug in sync_all --- src/mpi/mpi_caf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 424a1f55d..33ffde0d6 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -292,8 +292,11 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, locking_atomic_op(win, &value, newval, compare, image_index, index); if(value == caf_this_image && image_index == caf_this_image) - goto stat_error; - + { + printf("Lock already taken %d\n",caf_this_image); + goto stat_error; + } + if(acquired_lock != NULL) { if(value == 0) From c12de9747126acd5ddfeeabb848515fe81044874 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 25 May 2016 15:01:30 -0600 Subject: [PATCH 23/61] Minor change --- src/mpi/mpi_caf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 33ffde0d6..40bae57d8 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -383,7 +383,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, if(value == 0) goto stat_error; - + return; stat_error: @@ -1466,7 +1466,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, void PREFIX (get) (caf_token_t token, size_t offset, int image_index, - gfc_descriptor_t *src , + gfc_descriptor_t *src, caf_vector_t *src_vector __attribute__ ((unused)), gfc_descriptor_t *dest, int src_kind, int dst_kind, bool mrt) From f7de5fd6cbbdd9b199da40424f671275c9ca3f98 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 26 May 2016 11:05:08 -0600 Subject: [PATCH 24/61] Adding stat constraint to sync images, lock, unlock --- src/mpi/mpi_caf.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 40bae57d8..a316bd5e3 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -292,10 +292,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, locking_atomic_op(win, &value, newval, compare, image_index, index); if(value == caf_this_image && image_index == caf_this_image) - { - printf("Lock already taken %d\n",caf_this_image); - goto stat_error; - } + goto stat_error; if(acquired_lock != NULL) { @@ -320,6 +317,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { communicator_shrink(&CAF_COMM_WORLD); error_called = 0; + ierr = STAT_FAILED_IMAGE; } locking_atomic_op(win, &value, newval, compare, image_index, index); @@ -342,6 +340,11 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, } } + if(stat) + *stat = ierr; + else if(ierr == STAT_FAILED_IMAGE) + error_stop (ierr); + return; stat_error: @@ -369,7 +372,14 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, #if MPI_VERSION >= 3 int value=1, compare = 1, newval = 0; - /* locking_atomic_op(win, &value, newval, compare, image_index, index); */ + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, 0, win); @@ -383,6 +393,11 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, if(value == 0) goto stat_error; + + if(stat) + *stat = ierr; + else if(ierr == STAT_FAILED_IMAGE) + error_stop (ierr); return; @@ -1852,12 +1867,21 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg, for(i=0; i < count; i++) ierr = MPI_Wait(&handlers[images[i]-1], &s); + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + memset(arrived, 0, sizeof(int)*caf_num_images); } if (stat) *stat = ierr; + else if(ierr == STAT_FAILED_IMAGE) + error_stop (ierr); sync_images_err_chk: From 32e4cb4196bb64c4935912f73a46a0cd521e1559 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 26 May 2016 11:10:43 -0600 Subject: [PATCH 25/61] Adding stat constraint to event post and event wait --- src/mpi/mpi_caf.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index a316bd5e3..eb07578d4 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -2555,6 +2555,19 @@ PREFIX (event_post) (caf_token_t token, size_t index, #warning Events for MPI-2 are not implemented printf ("Events for MPI-2 are not supported, please update your MPI implementation\n"); #endif // MPI_VERSION + + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + error_stop (ierr); + if(ierr != MPI_SUCCESS) { if(stat != NULL) @@ -2615,6 +2628,19 @@ PREFIX (event_wait) (caf_token_t token, size_t index, # else // CAF_MPI_LOCK_UNLOCK MPI_Win_flush (image, *p); # endif // CAF_MPI_LOCK_UNLOCK + + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + error_stop (ierr); + if(ierr != MPI_SUCCESS) { if(stat != NULL) From c1190e83349e637bdb279e0e338f3c51fa6d7052 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 26 May 2016 11:17:42 -0600 Subject: [PATCH 26/61] Fixed few bugs on --- src/mpi/mpi_caf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index eb07578d4..22292592a 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -276,7 +276,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, const char msg[] = "Already locked"; #if MPI_VERSION >= 3 int value=0, compare = 0, newval = caf_this_image, i = 1,zero=0,ret=0; - int flag, it = 0, check_failure = 100; + int flag, it = 0, check_failure = 100,ierr=0; if(stat != NULL) *stat = 0; @@ -370,7 +370,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, if(stat != NULL) *stat = 0; #if MPI_VERSION >= 3 - int value=1, compare = 1, newval = 0; + int value=1, compare = 1, newval = 0,flag,ierr=0; MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); @@ -2529,7 +2529,7 @@ PREFIX (event_post) (caf_token_t token, size_t index, int image_index, int *stat, char *errmsg, int errmsg_len) { - int image, value=1, ierr=0; + int image, value=1, ierr=0,flag; MPI_Win *p = token; const char msg[] = "Error on event post"; From 881cc26e8863eb41998fdaf7d3c7137210240125 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 26 May 2016 14:34:56 -0600 Subject: [PATCH 27/61] Adding stat constraint to allocate --- src/mpi/mpi_caf.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 22292592a..31f04dce9 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -700,6 +700,13 @@ void * /* PREFIX(sync_all) (NULL,NULL,0); */ } + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + caf_static_t *tmp = malloc (sizeof (caf_static_t)); tmp->prev = caf_tot; tmp->token = *token; @@ -715,6 +722,8 @@ void * if (stat) *stat = 0; + else if(ierr == STAT_FAILED_IMAGE) + error_stop (ierr); return mem; From 02ffc818bf014197fd7f44d8fdcdd9f78dc254c1 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 26 May 2016 14:43:05 -0600 Subject: [PATCH 28/61] Fixed few bugs on --- src/mpi/mpi_caf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 31f04dce9..405cd2251 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -633,7 +633,7 @@ void * /* int ierr; */ void *mem; size_t actual_size; - int l_var=0, *init_array=NULL; + int l_var=0, *init_array=NULL,ierr=0; if (unlikely (caf_is_finalized)) goto error; @@ -689,7 +689,7 @@ void * MPI_Win_lock(MPI_LOCK_EXCLUSIVE, caf_this_image-1, 0, *p); # endif // CAF_MPI_LOCK_UNLOCK MPI_Put (init_array, size, MPI_INT, caf_this_image-1, - 0, size, MPI_INT, *p); + 0, size, MPI_INT, *p); # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_unlock(caf_this_image-1, *p); # else // CAF_MPI_LOCK_UNLOCK @@ -706,7 +706,7 @@ void * error_called = 0; ierr = STAT_FAILED_IMAGE; } - + caf_static_t *tmp = malloc (sizeof (caf_static_t)); tmp->prev = caf_tot; tmp->token = *token; @@ -722,7 +722,7 @@ void * if (stat) *stat = 0; - else if(ierr == STAT_FAILED_IMAGE) + else if (ierr == STAT_FAILED_IMAGE) error_stop (ierr); return mem; From f6074830fd5f2b998166ac72dc31f4b4816d0e2d Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 27 May 2016 10:08:52 -0600 Subject: [PATCH 29/61] Fixed bug in register --- src/mpi/mpi_caf.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 405cd2251..cdd454f53 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -107,8 +107,8 @@ char err_buffer[MPI_MAX_ERROR_STRING]; MPI_Comm CAF_COMM_WORLD; /* Failed Images */ -MPI_Comm lock_comm; -MPI_Request lock_req; +MPI_Comm lock_comm,stopped_comm; +MPI_Request lock_req,stopped_req; int used_comm = -1, n_failed_imgs=0, error_called=0; int *ranks_gc,*ranks_gf; //to be returned by failed images MPI_Errhandler errh,errh_w; @@ -493,6 +493,10 @@ PREFIX (init) (int *argc, char ***argv) MPI_Comm_set_errhandler(lock_comm, errh); MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,lock_comm,&lock_req); + MPI_Comm_dup(CAF_COMM_WORLD, &stopped_comm); + MPI_Comm_set_errhandler(stopped_comm, errh); + MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,lock_comm,&stopped_req); + MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); ranks_gf = (int*)malloc(caf_num_images * sizeof(int)); @@ -525,15 +529,14 @@ _gfortran_caf_finalize (void) PREFIX (finalize) (void) #endif { + int flag = 0; *img_status = STAT_STOPPED_IMAGE; /* GFC_STAT_STOPPED_IMAGE = 6000 */ MPI_Win_sync(*stat_tok); completed = 1; + MPIX_Comm_revoke(CAF_COMM_WORLD); + MPI_Barrier(stopped_comm); - MPI_Cancel(&lock_req); - MPI_Request_free(&lock_req); - MPI_Barrier(CAF_COMM_WORLD); - while (caf_static_list != NULL) { caf_static_t *tmp = caf_static_list->prev; @@ -600,7 +603,7 @@ int communicator_shrink(MPI_Comm *comm) MPIX_Comm_shrink(*comm, &shrunk); MPI_Comm_set_errhandler( shrunk, errh ); MPI_Comm_size(shrunk, &ns); MPI_Comm_rank(shrunk, &srank); - + MPI_Comm_rank(*comm, &crank); /* Split does the magic: removing spare processes and reordering ranks @@ -611,6 +614,7 @@ int communicator_shrink(MPI_Comm *comm) * new failures have disrupted the process: we need to * make sure we succeeded at all ranks, or retry until it works. */ flag = MPIX_Comm_agree(shrunk, &flag); + MPI_Comm_free(&shrunk); if( MPI_SUCCESS != flag ) { if( MPI_SUCCESS == rc ) MPI_Comm_free(*newcomm); @@ -633,7 +637,7 @@ void * /* int ierr; */ void *mem; size_t actual_size; - int l_var=0, *init_array=NULL,ierr=0; + int l_var=0, *init_array=NULL,ierr=0,flag=0; if (unlikely (caf_is_finalized)) goto error; @@ -662,6 +666,8 @@ void * else actual_size = size; + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + if(error_called == 1) { communicator_shrink(&CAF_COMM_WORLD); From 4fb6da1dbf2e3a28b97a3f6463bbc249812c9eac Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 27 May 2016 10:59:26 -0600 Subject: [PATCH 30/61] Using MPI_Comm_revoke --- src/mpi/mpi_caf.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index cdd454f53..16a0caab7 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -534,8 +534,6 @@ PREFIX (finalize) (void) MPI_Win_sync(*stat_tok); completed = 1; - MPIX_Comm_revoke(CAF_COMM_WORLD); - MPI_Barrier(stopped_comm); while (caf_static_list != NULL) { @@ -563,8 +561,16 @@ PREFIX (finalize) (void) MPI_Info_free (&mpi_info_same_size); #endif // MPI_VERSION - MPI_Comm_free(&CAF_COMM_WORLD); + //MPI_Comm_free(&CAF_COMM_WORLD); + + printf("Before revoke\n"); + MPIX_Comm_revoke(CAF_COMM_WORLD); + printf("After revoke\n"); + MPI_Test(&stopped_req,&flag,MPI_STATUS_IGNORE); + communicator_shrink(&stopped_comm); + MPI_Barrier(stopped_comm); + printf("After barrier\n"); /* Only call Finalize if CAF runtime Initialized MPI. */ if (caf_owns_mpi) { MPI_Finalize(); @@ -844,7 +850,7 @@ PREFIX (sync_memory) (int *stat, char *errmsg, int errmsg_len) void PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { - int ierr=0; + int ierr=0,flag=0; /* if(error_called == 1) */ /* { */ From d66b185109d5aff0897cb249ac8dca6f13ac952b Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 1 Jun 2016 15:07:10 -0600 Subject: [PATCH 31/61] Draft stopped images --- src/mpi/mpi_caf.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 16a0caab7..dc50bf4dd 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -113,6 +113,7 @@ int used_comm = -1, n_failed_imgs=0, error_called=0; int *ranks_gc,*ranks_gf; //to be returned by failed images MPI_Errhandler errh,errh_w; int completed = 0,tmp_lock; +int *stopped_images; static int cmpfunc (const void *a, const void *b) { @@ -501,7 +502,8 @@ PREFIX (init) (int *argc, char ***argv) ranks_gf = (int*)malloc(caf_num_images * sizeof(int)); ranks_gc = (int*)malloc(caf_num_images * sizeof(int)); - + stopped_images = (int*)calloc(caf_num_images, sizeof(int)); + #if MPI_VERSION >= 3 MPI_Info_create (&mpi_info_same_size); MPI_Info_set (mpi_info_same_size, "same_size", "true"); @@ -533,7 +535,10 @@ PREFIX (finalize) (void) *img_status = STAT_STOPPED_IMAGE; /* GFC_STAT_STOPPED_IMAGE = 6000 */ MPI_Win_sync(*stat_tok); - completed = 1; + MPIX_Comm_revoke(CAF_COMM_WORLD); + communicator_shrink(&CAF_COMM_WORLD); + + MPI_Barrier(stopped_comm); while (caf_static_list != NULL) { @@ -561,16 +566,8 @@ PREFIX (finalize) (void) MPI_Info_free (&mpi_info_same_size); #endif // MPI_VERSION - //MPI_Comm_free(&CAF_COMM_WORLD); - - printf("Before revoke\n"); + /* MPI_Comm_free(&CAF_COMM_WORLD); */ - MPIX_Comm_revoke(CAF_COMM_WORLD); - printf("After revoke\n"); - MPI_Test(&stopped_req,&flag,MPI_STATUS_IGNORE); - communicator_shrink(&stopped_comm); - MPI_Barrier(stopped_comm); - printf("After barrier\n"); /* Only call Finalize if CAF runtime Initialized MPI. */ if (caf_owns_mpi) { MPI_Finalize(); @@ -578,7 +575,7 @@ PREFIX (finalize) (void) pthread_mutex_lock(&lock_am); caf_is_finalized = 1; pthread_mutex_unlock(&lock_am); - printf("finalizing\n"); + /* printf("finalizing\n"); */ exit(0); } @@ -614,6 +611,8 @@ int communicator_shrink(MPI_Comm *comm) /* Split does the magic: removing spare processes and reordering ranks * so that all surviving processes remain at their former place */ + if (*img_status == STAT_STOPPED_IMAGE) + crank = -1; rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm); /* Split or some of the communications above may have failed if @@ -708,10 +707,11 @@ void * MPI_Win_flush(caf_this_image-1, *p); # endif // CAF_MPI_LOCK_UNLOCK free(init_array); - MPI_Barrier(CAF_COMM_WORLD); /* PREFIX(sync_all) (NULL,NULL,0); */ } + MPI_Barrier(CAF_COMM_WORLD); + if(error_called == 1) { communicator_shrink(&CAF_COMM_WORLD); From c2373cfa223f2f4e828308db3fa1b13432c7f642 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 3 Jun 2016 11:15:48 -0600 Subject: [PATCH 32/61] Partial version for stopped images --- src/libcaf.h | 1 + src/mpi/mpi_caf.c | 44 ++++++++++++++++++++++---------------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/src/libcaf.h b/src/libcaf.h index d7174874e..dafaaaa45 100644 --- a/src/libcaf.h +++ b/src/libcaf.h @@ -80,6 +80,7 @@ typedef void* caf_token_t; /* Linked list of static coarrays registered. */ typedef struct caf_static_t { caf_token_t token; + caf_token_t stopped_token; struct caf_static_t *prev; } caf_static_t; diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index dc50bf4dd..ad27ef4d5 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -132,8 +132,8 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ MPI_Group group_c, group_f; comm = *pcomm; - memset(ranks_gf,0,sizeof(int)*caf_num_images); - memset(ranks_gc,0,sizeof(int)*caf_num_images); + /* memset(ranks_gf,0,sizeof(int)*caf_num_images); */ + /* memset(ranks_gc,0,sizeof(int)*caf_num_images); */ MPIX_Comm_failure_ack(comm); MPIX_Comm_failure_get_acked(comm, &group_f); @@ -144,15 +144,10 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ MPI_Group_translate_ranks(group_f, nf, ranks_gf, group_c, ranks_gc); for(i = 0; i < nf; i++) - { - ranks_gc[i]++; - /* printf("me: %d - ranks failed %d\n",caf_this_image,ranks_gc[i]); */ - } + ranks_gc[i]++; - n_failed_imgs = nf; + n_failed_imgs += nf; error_called = 1; - /* used_comm++; */ - /* CAF_COMM_WORLD = communicators[used_comm]; */ } /* For MPI interoperability, allow external initialization @@ -500,21 +495,21 @@ PREFIX (init) (int *argc, char ***argv) MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); - ranks_gf = (int*)malloc(caf_num_images * sizeof(int)); - ranks_gc = (int*)malloc(caf_num_images * sizeof(int)); + ranks_gf = (int*)calloc(caf_num_images,sizeof(int)); + ranks_gc = (int*)calloc(caf_num_images,sizeof(int)); stopped_images = (int*)calloc(caf_num_images, sizeof(int)); #if MPI_VERSION >= 3 MPI_Info_create (&mpi_info_same_size); MPI_Info_set (mpi_info_same_size, "same_size", "true"); /* Setting img_status */ - MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, CAF_COMM_WORLD, &img_status, stat_tok); + MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, stopped_comm, &img_status, stat_tok); # ifndef CAF_MPI_LOCK_UNLOCK MPI_Win_lock_all(MPI_MODE_NOCHECK, *stat_tok); # endif // CAF_MPI_LOCK_UNLOCK #else MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &img_status, stat_tok); - MPI_Win_create(img_status, sizeof(int), 1, MPI_INFO_NULL, CAF_COMM_WORLD, stat_tok); + MPI_Win_create(img_status, sizeof(int), 1, MPI_INFO_NULL, stopped_comm, stat_tok); #endif // MPI_VERSION *img_status = 0; MPI_Win_set_errhandler(*stat_tok,errh_w); @@ -539,7 +534,7 @@ PREFIX (finalize) (void) communicator_shrink(&CAF_COMM_WORLD); MPI_Barrier(stopped_comm); - + while (caf_static_list != NULL) { caf_static_t *tmp = caf_static_list->prev; @@ -643,6 +638,7 @@ void * void *mem; size_t actual_size; int l_var=0, *init_array=NULL,ierr=0,flag=0; + MPI_Win *stopped_win; if (unlikely (caf_is_finalized)) goto error; @@ -658,6 +654,7 @@ void * /* Token contains only a list of pointers. */ *token = malloc (sizeof(MPI_Win)); + stopped_win = (MPI_Win *)malloc(sizeof(MPI_Win)); MPI_Win *p = *token; @@ -671,7 +668,7 @@ void * else actual_size = size; - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + MPI_Barrier(CAF_COMM_WORLD); if(error_called == 1) { @@ -710,18 +707,21 @@ void * /* PREFIX(sync_all) (NULL,NULL,0); */ } - MPI_Barrier(CAF_COMM_WORLD); + /* MPI_Barrier(CAF_COMM_WORLD); */ - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } + /* if(error_called == 1) */ + /* { */ + /* communicator_shrink(&CAF_COMM_WORLD); */ + /* error_called = 0; */ + /* ierr = STAT_FAILED_IMAGE; */ + /* } */ + /* MPI_Win_create_dynamic(MPI_INFO_NULL, stopped_comm, stopped_win); */ + caf_static_t *tmp = malloc (sizeof (caf_static_t)); tmp->prev = caf_tot; tmp->token = *token; + tmp->stopped_token = stopped_win; caf_tot = tmp; if (type == CAF_REGTYPE_COARRAY_STATIC) From f0a24c86851b124de875a9e8a000a228cfdf9a42 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 6 Jun 2016 14:24:19 -0600 Subject: [PATCH 33/61] Fixed bug in failed_images --- src/mpi/mpi_caf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index ad27ef4d5..7af520bfd 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -2809,8 +2809,8 @@ PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int)); qsort(mem,n_failed_imgs,sizeof(int),cmpfunc); array->dtype = 265; - array->dim[0].lower_bound = 1; - array->dim[0]._ubound = n_failed_imgs; + array->dim[0].lower_bound = 0; + array->dim[0]._ubound = n_failed_imgs-1; array->dim[0]._stride = 1; array->offset = -1; } From ae6ed63e7e61a5361b151a305425264dd1c19aed Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 10 Jun 2016 10:23:22 -0600 Subject: [PATCH 34/61] Get and put stat variable --- src/mpi/mpi_caf.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 7af520bfd..f94913d8a 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -1118,7 +1118,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, gfc_descriptor_t *dest, caf_vector_t *dst_vector __attribute__ ((unused)), gfc_descriptor_t *src, int dst_kind, int src_kind, - bool mrt) + int *stat, bool mrt) { /* FIXME: Implement vector subscripts, type conversion and check whether string-kind conversions are permitted. @@ -1505,11 +1505,10 @@ PREFIX (get) (caf_token_t token, size_t offset, gfc_descriptor_t *src, caf_vector_t *src_vector __attribute__ ((unused)), gfc_descriptor_t *dest, int src_kind, int dst_kind, - bool mrt) + int *stat, bool mrt) { size_t i, size; - int ierr = 0; - int j; + int ierr = 0, j, flag; MPI_Win *p = token; int rank = GFC_DESCRIPTOR_RANK (src); size_t src_size = GFC_DESCRIPTOR_SIZE (src); @@ -1572,9 +1571,18 @@ PREFIX (get) (caf_token_t token, size_t offset, # else // CAF_MPI_LOCK_UNLOCK MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + error_stop (ierr); } - if (ierr != 0) - error_stop (ierr); return; } @@ -1663,6 +1671,22 @@ PREFIX (get) (caf_token_t token, size_t offset, //sr_off = offset; + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + printf("In error_called\n"); + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + { + printf("error\n"); + error_stop (ierr); + } + # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_lock (MPI_LOCK_SHARED, image_index-1, 0, *p); # endif // CAF_MPI_LOCK_UNLOCK @@ -1675,8 +1699,8 @@ PREFIX (get) (caf_token_t token, size_t offset, MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK - if (ierr != 0) - error_stop (ierr); + /* if (ierr != 0) */ + /* error_stop (ierr); */ MPI_Type_free(&dt_s); MPI_Type_free(&dt_d); From 5ae617c6738088ea6eafc97f8fa918aab8abbf0f Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 10 Jun 2016 13:38:43 -0600 Subject: [PATCH 35/61] Adapting prototypes to the new stat= attribute --- src/libcaf.h | 4 ++-- src/mpi/mpi_caf.c | 28 ++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/libcaf.h b/src/libcaf.h index dafaaaa45..9a0a3144f 100644 --- a/src/libcaf.h +++ b/src/libcaf.h @@ -122,9 +122,9 @@ void *PREFIX (register) (size_t, caf_register_t, caf_token_t *, int *, char *, void PREFIX (deregister) (caf_token_t *, int *, char *, int); void PREFIX (caf_get) (caf_token_t, size_t, int, gfc_descriptor_t *, - caf_vector_t *, gfc_descriptor_t *, int, int); + caf_vector_t *, gfc_descriptor_t *, int, int *, int); void PREFIX (caf_send) (caf_token_t, size_t, int, gfc_descriptor_t *, - caf_vector_t *, gfc_descriptor_t *, int, int); + caf_vector_t *, gfc_descriptor_t *, int, int *, int); void PREFIX (caf_sendget) (caf_token_t, size_t, int, gfc_descriptor_t *, caf_vector_t *, caf_token_t, size_t, int, diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index f94913d8a..2210eba20 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -1123,7 +1123,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, /* FIXME: Implement vector subscripts, type conversion and check whether string-kind conversions are permitted. FIXME: Implement sendget as well. */ - int ierr = 0; + int ierr = 0, flag = 0; size_t i, size; int j; /* int position, msg = 0; */ @@ -1214,8 +1214,21 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, #endif // CAF_MPI_LOCK_UNLOCK } - if (ierr != 0) - error_stop (ierr); + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + error_stop (ierr); + + if(stat) + *stat = ierr; + return; } else @@ -1582,6 +1595,9 @@ PREFIX (get) (caf_token_t token, size_t offset, if(!stat && ierr == STAT_FAILED_IMAGE) error_stop (ierr); + + if(stat) + *stat = ierr; } return; } @@ -1678,14 +1694,10 @@ PREFIX (get) (caf_token_t token, size_t offset, communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; - printf("In error_called\n"); } if(!stat && ierr == STAT_FAILED_IMAGE) - { - printf("error\n"); - error_stop (ierr); - } + error_stop (ierr); # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_lock (MPI_LOCK_SHARED, image_index-1, 0, *p); From f244aa140bccb594602f30e8e64c5a920b444a52 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 10 Jun 2016 14:31:59 -0600 Subject: [PATCH 36/61] Fixed bug on number of failed images --- src/mpi/mpi_caf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 2210eba20..a6bfe41bf 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -131,9 +131,6 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ int nf,nc,i; MPI_Group group_c, group_f; comm = *pcomm; - - /* memset(ranks_gf,0,sizeof(int)*caf_num_images); */ - /* memset(ranks_gc,0,sizeof(int)*caf_num_images); */ MPIX_Comm_failure_ack(comm); MPIX_Comm_failure_get_acked(comm, &group_f); @@ -146,7 +143,9 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = 0; i < nf; i++) ranks_gc[i]++; - n_failed_imgs += nf; + printf("numero failed images %d\n",n_failed_imgs); + + n_failed_imgs = nf; error_called = 1; } From be7a1a64a0ff8f7d577877e3ed53d58b105d6af9 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 10 Jun 2016 14:43:39 -0600 Subject: [PATCH 37/61] Cleanup post bug-fix --- src/mpi/mpi_caf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index a6bfe41bf..00cffcce2 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -143,8 +143,6 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = 0; i < nf; i++) ranks_gc[i]++; - printf("numero failed images %d\n",n_failed_imgs); - n_failed_imgs = nf; error_called = 1; } From cba6ef5eab5bdf1f39b9a3d63f3b9588cf7fb5ae Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Fri, 17 Jun 2016 08:42:57 -0600 Subject: [PATCH 38/61] stat attribute in last position for get and put for compatibility --- src/mpi/mpi_caf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 00cffcce2..33239fe5e 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -1115,7 +1115,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, gfc_descriptor_t *dest, caf_vector_t *dst_vector __attribute__ ((unused)), gfc_descriptor_t *src, int dst_kind, int src_kind, - int *stat, bool mrt) + bool mrt, int *stat) { /* FIXME: Implement vector subscripts, type conversion and check whether string-kind conversions are permitted. @@ -1515,7 +1515,7 @@ PREFIX (get) (caf_token_t token, size_t offset, gfc_descriptor_t *src, caf_vector_t *src_vector __attribute__ ((unused)), gfc_descriptor_t *dest, int src_kind, int dst_kind, - int *stat, bool mrt) + bool mrt, int *stat) { size_t i, size; int ierr = 0, j, flag; From 59ce1e52bfebf36e71d20a730b771e13bd9a9aef Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Tue, 4 Oct 2016 10:44:12 -0600 Subject: [PATCH 39/61] General improvement after tutorial --- src/mpi/mpi_caf.c | 165 +++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 91 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 33239fe5e..3e610cc63 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -278,6 +278,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { + MPIX_Comm_revoke(CAF_COMM_WORLD); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; } @@ -308,6 +309,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { + MPIX_Comm_revoke(CAF_COMM_WORLD); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; @@ -369,6 +371,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, if(error_called == 1) { + MPIX_Comm_revoke(CAF_COMM_WORLD); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; @@ -592,9 +595,8 @@ int communicator_shrink(MPI_Comm *comm) MPI_Comm shrunk, *newcomm; MPI_Group cgrp, sgrp, dgrp; - redo: newcomm = (MPI_Comm *)calloc(1,sizeof(MPI_Comm)); - + redo: MPIX_Comm_shrink(*comm, &shrunk); MPI_Comm_set_errhandler( shrunk, errh ); MPI_Comm_size(shrunk, &ns); MPI_Comm_rank(shrunk, &srank); @@ -606,7 +608,7 @@ int communicator_shrink(MPI_Comm *comm) if (*img_status == STAT_STOPPED_IMAGE) crank = -1; rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm); - + flag = (rc == MPI_SUCCESS); /* Split or some of the communications above may have failed if * new failures have disrupted the process: we need to * make sure we succeeded at all ranks, or retry until it works. */ @@ -687,6 +689,12 @@ void * MPI_Win_set_errhandler(*p,errh_w); + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + } + if(l_var) { init_array = (int *)calloc(size, sizeof(int)); @@ -704,17 +712,13 @@ void * /* PREFIX(sync_all) (NULL,NULL,0); */ } - /* MPI_Barrier(CAF_COMM_WORLD); */ - - /* if(error_called == 1) */ - /* { */ - /* communicator_shrink(&CAF_COMM_WORLD); */ - /* error_called = 0; */ - /* ierr = STAT_FAILED_IMAGE; */ - /* } */ + if(error_called == 1) + { + MPIX_Comm_revoke(CAF_COMM_WORLD); + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + } - /* MPI_Win_create_dynamic(MPI_INFO_NULL, stopped_comm, stopped_win); */ - caf_static_t *tmp = malloc (sizeof (caf_static_t)); tmp->prev = caf_tot; tmp->token = *token; @@ -794,6 +798,13 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len /* PREFIX (sync_all) (NULL, NULL, 0); */ MPI_Barrier(CAF_COMM_WORLD); + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + MPI_Barrier(CAF_COMM_WORLD); + } + caf_static_t *tmp = caf_tot, *prev = caf_tot, *next=caf_tot; MPI_Win *p = *token; @@ -828,10 +839,6 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len if (stat) *stat = 0; - /* if (unlikely (ierr = ARMCI_Free ((*token)[caf_this_image-1]))) */ - /* caf_runtime_error ("ARMCI memory freeing failed: Error code %d", ierr); */ - //gasnet_exit(0); - free (*token); } @@ -848,13 +855,6 @@ void PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { int ierr=0,flag=0; - - /* if(error_called == 1) */ - /* { */ - /* communicator_shrink(&CAF_COMM_WORLD); */ - /* error_called = 0; */ - /* ierr = STAT_FAILED_IMAGE; */ - /* } */ if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; @@ -871,6 +871,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; + MPI_Barrier(CAF_COMM_WORLD); } if (stat) @@ -944,6 +945,7 @@ void selectType(int size, MPI_Datatype *dt) } +/* Not yet adapted for failed images */ void PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, gfc_descriptor_t *dest, @@ -1326,65 +1328,29 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK - if (ierr != 0) - { - error_stop (ierr); - return; - } - - MPI_Type_free (&dt_s); - MPI_Type_free (&dt_d); - - /* msg = 2; */ - /* MPI_Pack(&msg, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */ - /* MPI_Pack(&rank, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */ - - /* for(j=0;jdim[j]._stride), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */ - /* MPI_Pack(&(dest->dim[j].lower_bound), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */ - /* MPI_Pack(&(dest->dim[j]._ubound), 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */ - /* } */ - - /* MPI_Pack(&size, 1, MPI_INT, buff_am[caf_this_image], 1000, &position, CAF_COMM_WORLD); */ - - /* /\* non-blocking send *\/ */ - - /* MPI_Issend(buff_am[caf_this_image], position, MPI_PACKED, image_index-1, 1, CAF_COMM_WORLD, &reqdt); */ - - /* msgbody = calloc(size, sizeof(char)); */ + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + error_stop (ierr); - /* ptrdiff_t array_offset_sr = 0; */ - /* ptrdiff_t stride = 1; */ - /* ptrdiff_t extent = 1; */ + if(stat) + *stat = ierr; - /* for(i = 0; i < size; i++) */ + /* if (ierr != 0) */ /* { */ - /* for (j = 0; j < GFC_DESCRIPTOR_RANK (src)-1; j++) */ - /* { */ - /* array_offset_sr += ((i / (extent*stride)) */ - /* % (src->dim[j]._ubound */ - /* - src->dim[j].lower_bound + 1)) */ - /* * src->dim[j]._stride; */ - /* extent = (src->dim[j]._ubound - src->dim[j].lower_bound + 1); */ - /* stride = src->dim[j]._stride; */ - /* } */ - - /* array_offset_sr += (i / extent) * src->dim[rank-1]._stride; */ - - /* void *sr = (void *)((char *) src->base_addr */ - /* + array_offset_sr*GFC_DESCRIPTOR_SIZE (src)); */ - - /* memmove (msgbody+p_mb, sr, GFC_DESCRIPTOR_SIZE (src)); */ - - /* p_mb += GFC_DESCRIPTOR_SIZE (src); */ + /* error_stop (ierr); */ + /* return; */ /* } */ - /* MPI_Wait(&reqdt, &stadt); */ - - /* MPI_Ssend(msgbody, size, MPI_BYTE, image_index-1, 1, CAF_COMM_WORLD); */ - - /* free(msgbody); */ + MPI_Type_free (&dt_s); + MPI_Type_free (&dt_d); #else if(caf_this_image == image_index && mrt) @@ -1459,12 +1425,12 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, ierr = MPI_Put (pad_str, dst_size - src_size, MPI_BYTE, image_index-1, dst_offset, dst_size - src_size, MPI_BYTE, *p); } - - if (ierr != 0) - { - error_stop (ierr); - return; - } + + /* if (ierr != 0) */ + /* { */ + /* error_stop (ierr); */ + /* return; */ + /* } */ } if(caf_this_image == image_index && mrt) @@ -1503,6 +1469,20 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK #endif + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + + if(error_called == 1) + { + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + } + + if(!stat && ierr == STAT_FAILED_IMAGE) + error_stop (ierr); + + if(stat) + *stat = ierr; } } @@ -1684,23 +1664,26 @@ PREFIX (get) (caf_token_t token, size_t offset, //sr_off = offset; - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); +# ifdef CAF_MPI_LOCK_UNLOCK + MPI_Win_lock (MPI_LOCK_SHARED, image_index-1, 0, *p); +# endif // CAF_MPI_LOCK_UNLOCK + ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p); + + MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + if(error_called == 1) { communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; } - + if(!stat && ierr == STAT_FAILED_IMAGE) error_stop (ierr); -# ifdef CAF_MPI_LOCK_UNLOCK - MPI_Win_lock (MPI_LOCK_SHARED, image_index-1, 0, *p); -# endif // CAF_MPI_LOCK_UNLOCK - - ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p); + if(stat) + *stat = ierr; # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_unlock (image_index-1, *p); From dc64a9797a257e869ac8bccf62da79deeda64e96 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 5 Oct 2016 09:39:50 -0600 Subject: [PATCH 40/61] Fix small bug about number of failed images --- src/mpi/mpi_caf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 3e610cc63..20f1d118f 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -143,7 +143,7 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = 0; i < nf; i++) ranks_gc[i]++; - n_failed_imgs = nf; + n_failed_imgs += nf; error_called = 1; } From f4d3ce22d1450dd462776611f877fa21fa451dc6 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 5 Oct 2016 14:01:14 -0600 Subject: [PATCH 41/61] Fixed bug in locking --- src/mpi/mpi_caf.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 20f1d118f..26aba5ff3 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -109,9 +109,10 @@ MPI_Comm CAF_COMM_WORLD; /* Failed Images */ MPI_Comm lock_comm,stopped_comm; MPI_Request lock_req,stopped_req; -int used_comm = -1, n_failed_imgs=0, error_called=0; +int used_comm = -1, n_failed_imgs=0; +int error_called = 0, fake_error_called = 0; int *ranks_gc,*ranks_gf; //to be returned by failed images -MPI_Errhandler errh,errh_w; +MPI_Errhandler errh,errh_w,errh_fake; int completed = 0,tmp_lock; int *stopped_images; @@ -280,6 +281,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { MPIX_Comm_revoke(CAF_COMM_WORLD); communicator_shrink(&CAF_COMM_WORLD); + communicator_shrink(&lock_comm); error_called = 0; } @@ -311,6 +313,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { MPIX_Comm_revoke(CAF_COMM_WORLD); communicator_shrink(&CAF_COMM_WORLD); + communicator_shrink(&lock_comm); error_called = 0; ierr = STAT_FAILED_IMAGE; } @@ -324,7 +327,8 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, 0, win); # endif // CAF_MPI_LOCK_UNLOCK - MPI_Fetch_and_op(&zero, &value, MPI_INT, image_index-1, index*sizeof(int), MPI_REPLACE, win); + /* MPI_Fetch_and_op(&zero, &newval, MPI_INT, image_index-1, index*sizeof(int), MPI_REPLACE, win); */ + MPI_Compare_and_swap(&zero,&value,&newval,MPI_INT,image_index-1,index*sizeof(int), win); # ifdef CAF_MPI_LOCK_UNLOCK MPI_Win_unlock (image_index-1, win); # else // CAF_MPI_LOCK_UNLOCK @@ -373,6 +377,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, { MPIX_Comm_revoke(CAF_COMM_WORLD); communicator_shrink(&CAF_COMM_WORLD); + communicator_shrink(&lock_comm); error_called = 0; ierr = STAT_FAILED_IMAGE; } @@ -387,8 +392,9 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, MPI_Win_flush (image_index-1, win); # endif // CAF_MPI_LOCK_UNLOCK - if(value == 0) - goto stat_error; + /* Temporarily commented */ + /* if(value == 0) */ + /* goto stat_error; */ if(stat) *stat = ierr; @@ -483,15 +489,17 @@ PREFIX (init) (int *argc, char ***argv) stat_tok = malloc (sizeof(MPI_Win)); MPI_Comm_create_errhandler(verbose_comm_errhandler, &errh); + /* MPI_Comm_create_errhandler(fake_comm_errhandler, &errh_fake); */ MPI_Comm_set_errhandler(CAF_COMM_WORLD, errh); MPI_Comm_dup(CAF_COMM_WORLD, &lock_comm); + /* MPI_Comm_set_errhandler(lock_comm, errh_fake); */ MPI_Comm_set_errhandler(lock_comm, errh); MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,lock_comm,&lock_req); MPI_Comm_dup(CAF_COMM_WORLD, &stopped_comm); MPI_Comm_set_errhandler(stopped_comm, errh); - MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,lock_comm,&stopped_req); + MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,stopped_comm,&stopped_req); MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); @@ -1218,7 +1226,9 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, if(error_called == 1) { communicator_shrink(&CAF_COMM_WORLD); + communicator_shrink(&lock_comm); error_called = 0; + fake_error_called = 0; ierr = STAT_FAILED_IMAGE; } From 245dc1fe37fcd086aeb1a8ba59911b89644f8631 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 19 Oct 2016 09:46:14 -0600 Subject: [PATCH 42/61] Fixed num_images --- src/mpi/mpi_caf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 26aba5ff3..15e4e71b2 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -594,6 +594,7 @@ int PREFIX (num_images)(int distance __attribute__ ((unused)), int failed __attribute__ ((unused))) { + MPI_Comm_size(CAF_COMM_WORLD,&caf_num_images); return caf_num_images; } @@ -1226,7 +1227,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, if(error_called == 1) { communicator_shrink(&CAF_COMM_WORLD); - communicator_shrink(&lock_comm); + /* communicator_shrink(&lock_comm); */ error_called = 0; fake_error_called = 0; ierr = STAT_FAILED_IMAGE; From 6a1fbab09af5a815d7e8d3fe80e9808ac2fc5886 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 20 Oct 2016 08:41:50 -0600 Subject: [PATCH 43/61] Revert last change --- src/mpi/mpi_caf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 15e4e71b2..6a154002d 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -594,7 +594,6 @@ int PREFIX (num_images)(int distance __attribute__ ((unused)), int failed __attribute__ ((unused))) { - MPI_Comm_size(CAF_COMM_WORLD,&caf_num_images); return caf_num_images; } From f4411e6146db40a7d3ca27b2030ae24f46e1f6b4 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 24 Oct 2016 08:55:02 -0600 Subject: [PATCH 44/61] Failed images fixed --- src/mpi/mpi_caf.c | 60 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 6a154002d..d35f32458 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -111,7 +111,7 @@ MPI_Comm lock_comm,stopped_comm; MPI_Request lock_req,stopped_req; int used_comm = -1, n_failed_imgs=0; int error_called = 0, fake_error_called = 0; -int *ranks_gc,*ranks_gf; //to be returned by failed images +int *ranks_gc,*ranks_gf, *failed_images_array; //to be returned by failed images MPI_Errhandler errh,errh_w,errh_fake; int completed = 0,tmp_lock; int *stopped_images; @@ -129,22 +129,34 @@ static void verbose_win_errhandler(MPI_Win* win, int* err, ...) { static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ MPI_Comm comm; - int nf,nc,i; + int nf,nc,i,old_nf,j; MPI_Group group_c, group_f; comm = *pcomm; + + old_nf = n_failed_imgs; MPIX_Comm_failure_ack(comm); MPIX_Comm_failure_get_acked(comm, &group_f); MPI_Group_size(group_f, &nf); - MPI_Comm_group(comm, &group_c); + /* MPI_Comm_group(comm, &group_c); */ + MPI_Comm_group(MPI_COMM_WORLD, &group_c); for(i = 0; i < nf; i++) ranks_gf[i] = i; MPI_Group_translate_ranks(group_f, nf, ranks_gf, group_c, ranks_gc); - for(i = 0; i < nf; i++) - ranks_gc[i]++; + printf("%d in verbose old_nf:%d nf:%d\n",caf_this_image,old_nf,nf); n_failed_imgs += nf; + j=0; + + for(i = old_nf; i < n_failed_imgs; i++) + { + failed_images_array[i] = ranks_gc[j]; + printf("Ranks_gc %d\n",ranks_gc[j]); + failed_images_array[i]++; + j++; + } + error_called = 1; } @@ -322,7 +334,7 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, for(i=0;i= 3 @@ -522,7 +536,7 @@ PREFIX (init) (int *argc, char ***argv) *img_status = 0; MPI_Win_set_errhandler(*stat_tok,errh_w); } - /* MPI_Barrier(CAF_COMM_WORLD); */ + MPI_Barrier(CAF_COMM_WORLD); } /* Finalize coarray program. */ @@ -609,8 +623,9 @@ int communicator_shrink(MPI_Comm *comm) MPI_Comm_set_errhandler( shrunk, errh ); MPI_Comm_size(shrunk, &ns); MPI_Comm_rank(shrunk, &srank); - MPI_Comm_rank(*comm, &crank); - + // MPI_Comm_rank(*comm, &crank); + MPI_Comm_rank(MPI_COMM_WORLD, &crank); + printf("me: %d becomes: %d\n",caf_this_image,crank+1); /* Split does the magic: removing spare processes and reordering ranks * so that all surviving processes remain at their former place */ if (*img_status == STAT_STOPPED_IMAGE) @@ -863,6 +878,15 @@ void PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { int ierr=0,flag=0; + + if(error_called == 1) + { + printf("%d First if in sync all\n",caf_this_image); + communicator_shrink(&CAF_COMM_WORLD); + error_called = 0; + ierr = STAT_FAILED_IMAGE; + /* MPI_Barrier(CAF_COMM_WORLD); */ + } if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; @@ -876,6 +900,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) if(error_called == 1) { + printf("%d Second if in sync all\n",caf_this_image); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; @@ -1221,7 +1246,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, #endif // CAF_MPI_LOCK_UNLOCK } - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ if(error_called == 1) { @@ -1338,10 +1363,11 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ if(error_called == 1) { + printf("%d In second shrink\n",caf_this_image); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; @@ -1479,7 +1505,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK #endif - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ if(error_called == 1) { @@ -1571,7 +1597,7 @@ PREFIX (get) (caf_token_t token, size_t offset, # else // CAF_MPI_LOCK_UNLOCK MPI_Win_flush (image_index-1, *p); # endif // CAF_MPI_LOCK_UNLOCK - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ if(error_called == 1) { @@ -1680,7 +1706,7 @@ PREFIX (get) (caf_token_t token, size_t offset, ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p); - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ if(error_called == 1) { @@ -2802,7 +2828,7 @@ PREFIX (image_status) (int image) int i,res=0, remote_stat=0,ierr; for(i=0;ibase_addr = mem; - memcpy(mem,ranks_gc,n_failed_imgs*sizeof(int)); + memcpy(mem,failed_images_array,n_failed_imgs*sizeof(int)); qsort(mem,n_failed_imgs,sizeof(int),cmpfunc); array->dtype = 265; - array->dim[0].lower_bound = 0; + array->dim[0].lower_bound = 1; array->dim[0]._ubound = n_failed_imgs-1; array->dim[0]._stride = 1; array->offset = -1; From 8421c8c833abfa750c766b9a945becaee596487d Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Mon, 24 Oct 2016 09:33:42 -0600 Subject: [PATCH 45/61] Cleanup --- src/mpi/mpi_caf.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index d35f32458..71298ad19 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -144,7 +144,6 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ ranks_gf[i] = i; MPI_Group_translate_ranks(group_f, nf, ranks_gf, group_c, ranks_gc); - printf("%d in verbose old_nf:%d nf:%d\n",caf_this_image,old_nf,nf); n_failed_imgs += nf; j=0; @@ -152,7 +151,6 @@ static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ for(i = old_nf; i < n_failed_imgs; i++) { failed_images_array[i] = ranks_gc[j]; - printf("Ranks_gc %d\n",ranks_gc[j]); failed_images_array[i]++; j++; } @@ -625,7 +623,6 @@ int communicator_shrink(MPI_Comm *comm) // MPI_Comm_rank(*comm, &crank); MPI_Comm_rank(MPI_COMM_WORLD, &crank); - printf("me: %d becomes: %d\n",caf_this_image,crank+1); /* Split does the magic: removing spare processes and reordering ranks * so that all surviving processes remain at their former place */ if (*img_status == STAT_STOPPED_IMAGE) @@ -881,7 +878,6 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) if(error_called == 1) { - printf("%d First if in sync all\n",caf_this_image); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; @@ -900,7 +896,6 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) if(error_called == 1) { - printf("%d Second if in sync all\n",caf_this_image); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; @@ -1367,7 +1362,6 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, if(error_called == 1) { - printf("%d In second shrink\n",caf_this_image); communicator_shrink(&CAF_COMM_WORLD); error_called = 0; ierr = STAT_FAILED_IMAGE; From c2887c7e11bcf43efcace18314780f287cc32916 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Tue, 25 Oct 2016 14:12:45 -0600 Subject: [PATCH 46/61] New stopped_images function --- src/mpi/mpi_caf.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 71298ad19..5490bc90b 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -114,7 +114,7 @@ int error_called = 0, fake_error_called = 0; int *ranks_gc,*ranks_gf, *failed_images_array; //to be returned by failed images MPI_Errhandler errh,errh_w,errh_fake; int completed = 0,tmp_lock; -int *stopped_images; +int *stopped_images, n_stopped_imgs; static int cmpfunc (const void *a, const void *b) { @@ -2860,3 +2860,18 @@ PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused array->dim[0]._stride = 1; array->offset = -1; } + +void +PREFIX (stopped_images) (gfc_descriptor_t *array, int team __attribute__ ((unused)), + int kind __attribute__ ((unused))) +{ + int *mem = (int *)calloc(n_stopped_imgs,sizeof(int)); + array->base_addr = mem; + memcpy(mem,stopped_images,n_stopped_imgs*sizeof(int)); + qsort(mem,n_stopped_imgs,sizeof(int),cmpfunc); + array->dtype = 265; + array->dim[0].lower_bound = 1; + array->dim[0]._ubound = n_stopped_imgs-1; + array->dim[0]._stride = 1; + array->offset = -1; +} From 0264344191bbc0a5ea1b2df094f20c3366b9cf5f Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Wed, 26 Oct 2016 15:19:14 -0600 Subject: [PATCH 47/61] Stopped images --- src/mpi/mpi_caf.c | 89 +++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 5490bc90b..e6a9db31a 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -87,7 +87,7 @@ caf_static_t *caf_tot = NULL; /* Image status variable */ -static int *img_status = NULL; +static int img_status = 0; MPI_Win *stat_tok; /* Active messages variables */ @@ -114,7 +114,7 @@ int error_called = 0, fake_error_called = 0; int *ranks_gc,*ranks_gf, *failed_images_array; //to be returned by failed images MPI_Errhandler errh,errh_w,errh_fake; int completed = 0,tmp_lock; -int *stopped_images, n_stopped_imgs; +int *stopped_imgs, n_stopped_imgs; static int cmpfunc (const void *a, const void *b) { @@ -511,27 +511,30 @@ PREFIX (init) (int *argc, char ***argv) MPI_Comm_dup(CAF_COMM_WORLD, &stopped_comm); MPI_Comm_set_errhandler(stopped_comm, errh); MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,stopped_comm,&stopped_req); - + MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); ranks_gf = (int*)calloc(caf_num_images,sizeof(int)); ranks_gc = (int*)calloc(caf_num_images,sizeof(int)); failed_images_array = (int*)calloc(caf_num_images,sizeof(int)); - stopped_images = (int*)calloc(caf_num_images, sizeof(int)); - + /* stopped_imgs = (int*)calloc(caf_num_images, sizeof(int)); */ + #if MPI_VERSION >= 3 - MPI_Info_create (&mpi_info_same_size); - MPI_Info_set (mpi_info_same_size, "same_size", "true"); + /* MPI_Info_create (&mpi_info_same_size); */ + /* MPI_Info_set (mpi_info_same_size, "same_size", "true"); */ /* Setting img_status */ - MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, stopped_comm, &img_status, stat_tok); + /* MPI_Win_allocate(sizeof(int), 1, mpi_info_same_size, stopped_comm, &img_status, stat_tok); */ + MPI_Win_allocate(sizeof(int)*caf_num_images, 1, MPI_INFO_NULL, stopped_comm, &stopped_imgs, stat_tok); # ifndef CAF_MPI_LOCK_UNLOCK MPI_Win_lock_all(MPI_MODE_NOCHECK, *stat_tok); # endif // CAF_MPI_LOCK_UNLOCK #else - MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &img_status, stat_tok); - MPI_Win_create(img_status, sizeof(int), 1, MPI_INFO_NULL, stopped_comm, stat_tok); + MPI_Alloc_mem(sizeof(int)*caf_num_images, MPI_INFO_NULL, &stopped_imgs, stat_tok); + MPI_Win_create(stopped_imgs, sizeof(int)*caf_num_images, 1, MPI_INFO_NULL, stopped_comm, stat_tok); #endif // MPI_VERSION - *img_status = 0; + for(i=0;i= 3 - MPI_Info_free (&mpi_info_same_size); + /* MPI_Info_free (&mpi_info_same_size); */ #endif // MPI_VERSION /* MPI_Comm_free(&CAF_COMM_WORLD); */ @@ -625,7 +646,7 @@ int communicator_shrink(MPI_Comm *comm) MPI_Comm_rank(MPI_COMM_WORLD, &crank); /* Split does the magic: removing spare processes and reordering ranks * so that all surviving processes remain at their former place */ - if (*img_status == STAT_STOPPED_IMAGE) + if (img_status == STAT_STOPPED_IMAGE) crank = -1; rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm); flag = (rc == MPI_SUCCESS); @@ -2819,7 +2840,7 @@ PREFIX (fail_image) (void) int PREFIX (image_status) (int image) { - int i,res=0, remote_stat=0,ierr; + int i,res=0, ierr; for(i=0;ibase_addr = mem; - memcpy(mem,stopped_images,n_stopped_imgs*sizeof(int)); - qsort(mem,n_stopped_imgs,sizeof(int),cmpfunc); array->dtype = 265; array->dim[0].lower_bound = 1; array->dim[0]._ubound = n_stopped_imgs-1; From 168a2539cbd6a073e961b353023a44dd3f3e0f39 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 2 Mar 2017 11:01:11 -0700 Subject: [PATCH 48/61] Update from opencoarrays_ft_rep --- src/mpi/mpi_caf.c | 79 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index e6a9db31a..012f9a9c6 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -439,6 +439,7 @@ _gfortran_caf_init (int *argc, char ***argv) PREFIX (init) (int *argc, char ***argv) #endif { + int rc,flag; if (caf_num_images == 0) { int ierr = 0, i = 0, j = 0; @@ -475,7 +476,12 @@ PREFIX (init) (int *argc, char ***argv) /* Duplicate MPI_COMM_WORLD so that no CAF internal functions use it - this is critical for MPI-interoperability. */ - MPI_Comm_dup(MPI_COMM_WORLD, &CAF_COMM_WORLD); + rc = MPI_Comm_dup(MPI_COMM_WORLD, &CAF_COMM_WORLD); + flag = (MPI_SUCCESS == rc); + flag = MPIX_Comm_agree(MPI_COMM_WORLD,&flag); + if(flag != MPI_SUCCESS) + MPI_Abort(MPI_COMM_WORLD,10000); + MPI_Barrier(MPI_COMM_WORLD); MPI_Comm_size(CAF_COMM_WORLD, &caf_num_images); MPI_Comm_rank(CAF_COMM_WORLD, &caf_this_image); @@ -562,7 +568,7 @@ PREFIX (finalize) (void) failed = 1; break; } - if (!failed) + if (!failed && i != caf_this_image-1) { MPI_Accumulate (&one, 1, MPI_INT, i, (caf_this_image-1)*sizeof(int), 1, MPI_INT, MPI_REPLACE, *stat_tok); } @@ -676,28 +682,31 @@ void * { /* int ierr; */ void *mem; + MPI_Win *p; size_t actual_size; int l_var=0, *init_array=NULL,ierr=0,flag=0; MPI_Win *stopped_win; - + if (unlikely (caf_is_finalized)) goto error; - + /* Start GASNET if not already started. */ if (caf_num_images == 0) #ifdef COMPILER_SUPPORTS_CAF_INTRINSICS _gfortran_caf_init (NULL, NULL); #else - PREFIX (init) (NULL, NULL); + PREFIX (init) (NULL, NULL); #endif - + /* Token contains only a list of pointers. */ - - *token = malloc (sizeof(MPI_Win)); + + /* *token = malloc (sizeof(MPI_Win)); */ + p = malloc(sizeof(MPI_Win)); + *token = p; stopped_win = (MPI_Win *)malloc(sizeof(MPI_Win)); - - MPI_Win *p = *token; - + + /* MPI_Win *p = *token; */ + if(type == CAF_REGTYPE_LOCK_STATIC || type == CAF_REGTYPE_LOCK_ALLOC || type == CAF_REGTYPE_CRITICAL || type == CAF_REGTYPE_EVENT_STATIC || type == CAF_REGTYPE_EVENT_ALLOC) @@ -717,9 +726,9 @@ void * } #if MPI_VERSION >= 3 - MPI_Win_allocate(actual_size, 1, mpi_info_same_size, CAF_COMM_WORLD, &mem, *token); + MPI_Win_allocate(actual_size, 1, MPI_INFO_NULL, CAF_COMM_WORLD, &mem, p); # ifndef CAF_MPI_LOCK_UNLOCK - MPI_Win_lock_all(MPI_MODE_NOCHECK, *p); + MPI_Win_lock_all(0, *p); # endif // CAF_MPI_LOCK_UNLOCK #else // MPI_VERSION MPI_Alloc_mem(actual_size, MPI_INFO_NULL, &mem); @@ -728,7 +737,7 @@ void * p = *token; - MPI_Win_set_errhandler(*p,errh_w); + /* MPI_Win_set_errhandler(*p,errh_w); */ if(error_called == 1) { @@ -897,13 +906,13 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { int ierr=0,flag=0; - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - /* MPI_Barrier(CAF_COMM_WORLD); */ - } + /* if(error_called == 1) */ + /* { */ + /* communicator_shrink(&CAF_COMM_WORLD); */ + /* error_called = 0; */ + /* ierr = STAT_FAILED_IMAGE; */ + /* /\* MPI_Barrier(CAF_COMM_WORLD); *\/ */ + /* } */ if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; @@ -1196,6 +1205,20 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, if (size == 0) return; + for(j=0;j src_size) { pad_str = alloca (dst_size - src_size); @@ -1571,6 +1594,20 @@ PREFIX (get) (caf_token_t token, size_t offset, if (size == 0) return; + for(j=0;j src_size) { pad_str = alloca (dst_size - src_size); From 5737bcaf4d3fc788868de8db728c6b162a003b79 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 2 Mar 2017 11:03:59 -0700 Subject: [PATCH 49/61] Minor change --- src/mpi/mpi_caf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 012f9a9c6..5d9fdf1e2 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -2877,7 +2877,7 @@ PREFIX (fail_image) (void) int PREFIX (image_status) (int image) { - int i,res=0, ierr; + int i,res=0, ierr=0; for(i=0;i Date: Thu, 2 Mar 2017 11:08:44 -0700 Subject: [PATCH 50/61] First Aurelien patch --- src/mpi/mpi_caf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 5d9fdf1e2..da12a980a 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -639,10 +639,9 @@ PREFIX (num_images)(int distance __attribute__ ((unused)), int communicator_shrink(MPI_Comm *comm) { int ns,srank,crank,rc,flag,i,drank,nc,nd; - MPI_Comm shrunk, *newcomm; + MPI_Comm shrunk, newcomm; MPI_Group cgrp, sgrp, dgrp; - newcomm = (MPI_Comm *)calloc(1,sizeof(MPI_Comm)); redo: MPIX_Comm_shrink(*comm, &shrunk); MPI_Comm_set_errhandler( shrunk, errh ); @@ -654,7 +653,7 @@ int communicator_shrink(MPI_Comm *comm) * so that all surviving processes remain at their former place */ if (img_status == STAT_STOPPED_IMAGE) crank = -1; - rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, newcomm); + rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, &newcomm); flag = (rc == MPI_SUCCESS); /* Split or some of the communications above may have failed if * new failures have disrupted the process: we need to @@ -663,10 +662,10 @@ int communicator_shrink(MPI_Comm *comm) MPI_Comm_free(&shrunk); if( MPI_SUCCESS != flag ) { - if( MPI_SUCCESS == rc ) MPI_Comm_free(*newcomm); + if( MPI_SUCCESS == rc ) MPI_Comm_free(newcomm); goto redo; } - *comm = *newcomm; + *comm = newcomm; return MPI_SUCCESS; } From 798f073d1033dec0c3147b8e022d149ab1882bf4 Mon Sep 17 00:00:00 2001 From: Alessandro Fanfarillo Date: Thu, 2 Mar 2017 15:45:15 -0700 Subject: [PATCH 51/61] Fixed register for GCC-7 compatibility --- src/libcaf.h | 4 ++-- src/mpi/mpi_caf.c | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/libcaf.h b/src/libcaf.h index 9a0a3144f..a2ac3e37b 100644 --- a/src/libcaf.h +++ b/src/libcaf.h @@ -117,8 +117,8 @@ void PREFIX (finalize) (void); int PREFIX (this_image) (int); int PREFIX (num_images) (int, int); -void *PREFIX (register) (size_t, caf_register_t, caf_token_t *, int *, char *, - int); +void PREFIX (register) (size_t, caf_register_t, caf_token_t *, gfc_descriptor_t *, + int *, char *, int); void PREFIX (deregister) (caf_token_t *, int *, char *, int); void PREFIX (caf_get) (caf_token_t, size_t, int, gfc_descriptor_t *, diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index da12a980a..aafd4dc51 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -674,9 +674,10 @@ void * _gfortran_caf_register (size_t size, caf_register_t type, caf_token_t *token, int *stat, char *errmsg, int errmsg_len) #else -void * +void PREFIX (register) (size_t size, caf_register_t type, caf_token_t *token, - int *stat, char *errmsg, int errmsg_len) + gfc_descriptor_t *desc, int *stat, char *errmsg, + int errmsg_len) #endif { /* int ierr; */ @@ -787,7 +788,9 @@ void * else if (ierr == STAT_FAILED_IMAGE) error_stop (ierr); - return mem; + desc->base_addr = mem; + + return; error: { From 7e1054c7c16b89c7d204db924f2a571cf97d750e Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Fri, 10 Mar 2017 18:14:07 +0100 Subject: [PATCH 52/61] Added first testcase. --- .../unit/image_states/image_status_test_1.f90 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 src/tests/unit/image_states/image_status_test_1.f90 diff --git a/src/tests/unit/image_states/image_status_test_1.f90 b/src/tests/unit/image_states/image_status_test_1.f90 new file mode 100644 index 000000000..13f8cbb7e --- /dev/null +++ b/src/tests/unit/image_states/image_status_test_1.f90 @@ -0,0 +1,16 @@ +program test_image_status_1 + use iso_fortran_env , only : STAT_STOPPED_IMAGE + implicit none + integer :: i + + associate(np => num_images(), me => this_image()) + do i= 1, np + if (image_status(i) /= 0) error stop "image_status(i) should not fail" + end do + + sync all + if (me == 1) print *,"Test passed." + end associate + +end program test_image_status_1 + From f1ed29e3cc3f1def14c5cf961cc13737d4ed752a Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Sat, 11 Mar 2017 11:11:20 +0100 Subject: [PATCH 53/61] Added missing cmake-file for image_states-tests. --- src/tests/unit/image_states/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/tests/unit/image_states/CMakeLists.txt diff --git a/src/tests/unit/image_states/CMakeLists.txt b/src/tests/unit/image_states/CMakeLists.txt new file mode 100644 index 000000000..cd3ef86ca --- /dev/null +++ b/src/tests/unit/image_states/CMakeLists.txt @@ -0,0 +1,3 @@ +add_executable(image_status_test_1 image_status_test_1.f90) +target_link_libraries(image_status_test_1 OpenCoarrays) + From dca073880c9b1b031f3bbc72958710786070f2d6 Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Sun, 30 Apr 2017 13:18:14 +0200 Subject: [PATCH 54/61] Implement FAILED IMAGES support for gcc >= 7 and mpich >= 3.2. Add testcases for failed images. Fixes #309. Fixes #354. --- CMakeLists.txt | 33 +- src/libcaf.h | 10 +- src/mpi/mpi_caf.c | 1333 +++++++++-------- src/tests/unit/CMakeLists.txt | 2 +- src/tests/unit/fail_images/CMakeLists.txt | 27 + .../image_fail_and_failed_images_test_1.f90 | 31 + .../fail_images/image_fail_and_get_test_1.f90 | 32 + .../image_fail_and_status_test_1.f90 | 33 + .../image_fail_and_stopped_images_test_1.f90 | 30 + .../image_fail_and_sync_test_1.f90 | 27 + .../image_fail_and_sync_test_2.f90 | 25 + .../image_fail_and_sync_test_3.f90 | 24 + .../unit/fail_images/image_fail_test_1.f90 | 21 + .../image_status_test_1.f90 | 2 + src/tests/unit/image_states/CMakeLists.txt | 3 - 15 files changed, 1010 insertions(+), 623 deletions(-) create mode 100644 src/tests/unit/fail_images/CMakeLists.txt create mode 100644 src/tests/unit/fail_images/image_fail_and_failed_images_test_1.f90 create mode 100644 src/tests/unit/fail_images/image_fail_and_get_test_1.f90 create mode 100644 src/tests/unit/fail_images/image_fail_and_status_test_1.f90 create mode 100644 src/tests/unit/fail_images/image_fail_and_stopped_images_test_1.f90 create mode 100644 src/tests/unit/fail_images/image_fail_and_sync_test_1.f90 create mode 100644 src/tests/unit/fail_images/image_fail_and_sync_test_2.f90 create mode 100644 src/tests/unit/fail_images/image_fail_and_sync_test_3.f90 create mode 100644 src/tests/unit/fail_images/image_fail_test_1.f90 rename src/tests/unit/{image_states => fail_images}/image_status_test_1.f90 (82%) delete mode 100644 src/tests/unit/image_states/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 51250feb1..9a84f394b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -384,6 +384,24 @@ function(add_mpi_test name num_mpi_proc path) set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.") endfunction(add_mpi_test) +function(add_mpi_failable_test name num_mpi_proc path) + if ( ((N LESS num_mpi_proc) OR (N EQUAL 0)) ) + message(STATUS "Test ${name} is oversubscribed: ${num_mpi_proc} ranks requested with ${N} system processor available.") + if ( openmpi ) + if ( N LESS 2 ) + set( num_mpi_proc 2 ) + set (test_parameters --oversubscribe) + else() + set ( num_mpi_proc ${N} ) + endif() + message( STATUS "Open-MPI detected, over-riding oversubscribed test, ${name}, with ${num_mpi_proc} ranks." ) + endif() + endif() + set(test_parameters ${test_parameters} ${MPIEXEC_NUMPROC_FLAG} ${num_mpi_proc} -disable-auto-cleanup ) + add_test(NAME ${name} COMMAND ${MPIEXEC} ${test_parameters} "${path}") + set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.") +endfunction(add_mpi_failable_test) + set(tests_root ${CMAKE_CURRENT_BINARY_DIR}/src/tests) @@ -447,7 +465,20 @@ if(opencoarrays_aware_compiler) add_mpi_test(co_reduce_string 4 ${tests_root}/unit/collectives/co_reduce_string) # IMAGE FAIL tests - add_mpi_test(image_status_test_1 4 ${tests_root}/unit/image_states/image_status_test_1) + add_mpi_test(image_status_test_1 4 ${tests_root}/unit/fail_images/image_status_test_1) +#ifdef WITH_FAIL_IMAGES +# No other way to check that image_fail_test_1 passes. + add_mpi_failable_test(image_fail_test_1 4 ${tests_root}/unit/fail_images/image_fail_test_1) + set_property(TEST image_fail_test_1 PROPERTY FAIL_REGULAR_EXPRESSION "Test failed") + set_property(TEST image_fail_test_1 PROPERTY PASS_REGULAR_EXPRESSION "Test passed") + add_mpi_failable_test(image_fail_and_sync_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_1) + add_mpi_failable_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2) + add_mpi_failable_test(image_fail_and_sync_test_3 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_3) + add_mpi_failable_test(image_fail_and_status_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_status_test_1) + add_mpi_failable_test(image_fail_and_failed_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_failed_images_test_1) + add_mpi_failable_test(image_fail_and_stopped_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_stopped_images_test_1) + add_mpi_failable_test(image_fail_and_get_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_get_test_1) +#endif else() add_test(co_sum_extension ${tests_root}/unit/extensions/test-co_sum-extension.sh) set_property(TEST co_sum_extension PROPERTY PASS_REGULAR_EXPRESSION "Test passed.") diff --git a/src/libcaf.h b/src/libcaf.h index 4ebb57fef..3bf9aa593 100644 --- a/src/libcaf.h +++ b/src/libcaf.h @@ -240,7 +240,8 @@ void PREFIX (caf_send) (caf_token_t, size_t, int, gfc_descriptor_t *, void PREFIX (caf_sendget) (caf_token_t, size_t, int, gfc_descriptor_t *, caf_vector_t *, caf_token_t, size_t, int, - gfc_descriptor_t *, caf_vector_t *, int, int, bool); + gfc_descriptor_t *, caf_vector_t *, int, int, bool, + int *); #ifdef GCC_GE_7 void PREFIX(get_by_ref) (caf_token_t, int, @@ -269,9 +270,16 @@ void PREFIX (sync_all) (int *, char *, int); void PREFIX (sync_images) (int, int[], int *, char *, int); void PREFIX (sync_memory) (int *, char *, int); +void PREFIX (stop_str) (const char *, int32_t) __attribute__ ((noreturn)); +void PREFIX (stop) (int32_t) __attribute__ ((noreturn)); void PREFIX (error_stop_str) (const char *, int32_t) __attribute__ ((noreturn)); void PREFIX (error_stop) (int32_t) __attribute__ ((noreturn)); +void PREFIX (fail_image) (void) __attribute__ ((noreturn)); + +int PREFIX (image_status) (int); +void PREFIX (failed_images) (gfc_descriptor_t *, int, int *); +void PREFIX (stopped_images) (gfc_descriptor_t *, int, int *); void PREFIX (atomic_define) (caf_token_t, size_t, int, void *, int *, int, int); void PREFIX (atomic_ref) (caf_token_t, size_t, int, void *, int *, int, int); diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index c541b679d..ebf807e2d 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -41,12 +41,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include +#include /* For raise */ -#ifdef MPI_HAS_MPI_EXT +#ifdef MPI_NEEDS_MPI_EXT #include -#ifdef USE_FAILED_IMAGES - #define WITH_MPI_FAILED 1 #endif +#ifdef USE_FAILED_IMAGES + #define WITH_FAILED_IMAGES 1 #endif #include "libcaf.h" @@ -55,6 +56,12 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* #define GFC_CAF_CHECK 1 */ +#ifdef NDEBUG +#define dprint(...) +#else +#define dprint(args...) fprintf (stderr, args) +#endif + #ifdef GCC_GE_7 /** The caf-token of the mpi-library. @@ -90,7 +97,12 @@ typedef MPI_Win *mpi_caf_token_t; #define TOKEN(X) ((mpi_caf_token_t) (X)) #endif -static void error_stop (int error) __attribute__ ((noreturn)); +/* Forward declaration of prototype. */ + +static void terminate_internal (int stat_code, int exit_code) + __attribute__ ((noreturn)); +static void sync_images_internal (int count, int images[], int *stat, + char *errmsg, int errmsg_len, bool internal); /* Global variables. */ static int caf_this_image; @@ -101,12 +113,12 @@ static int caf_is_finalized = 0; MPI_Info mpi_info_same_size; #endif // MPI_VERSION -/*Sync image part*/ +/* Variables needed for syncing images. */ -static int *orders; static int *images_full; MPI_Request *sync_handles; static int *arrived; +static const int MPI_TAG_CAF_SYNC_IMAGES = 424242; /* Pending puts */ #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK) @@ -145,57 +157,36 @@ char err_buffer[MPI_MAX_ERROR_STRING]; MPI_Comm CAF_COMM_WORLD; #ifdef WITH_FAILED_IMAGES -/* Failed Images */ -MPI_Comm lock_comm, stopped_comm; -MPI_Request lock_req, stopped_req; -int used_comm = -1, n_failed_imgs=0; -int error_called = 0, fake_error_called = 0; -int *ranks_gc, *ranks_gf, *failed_images_array; -MPI_Errhandler errh, errh_w, errh_fake; -int completed = 0, tmp_lock; -int *stopped_imgs, n_stopped_imgs; - -static int cmpfunc (const void *a, const void *b) -{ - return ( *(int*)a - *(int*)b ); -} +/* The stati of the other images. image_stati is an array of size + * caf_num_images at the beginning the status of each image is noted here + * where the index is the image number minus one. */ +int *image_stati; -static void verbose_win_errhandler(MPI_Win* win, int* err, ...) { - /* printf("in win err handler\n"); */ - /* used_comm++; */ - /* CAF_COMM_WORLD = communicators[used_comm]; */ -} +/* This gives the number of all images that are known to have failed. */ +int num_images_failed = 0; -static void verbose_comm_errhandler(MPI_Comm* pcomm, int* err, ...){ - MPI_Comm comm; - int nf, i, old_nf, j; - MPI_Group group_c, group_f; - comm = *pcomm; +/* This is the number of all images that are known to have stopped. */ +int num_images_stopped = 0; - old_nf = n_failed_imgs; - - MPIX_Comm_failure_ack(comm); - MPIX_Comm_failure_get_acked(comm, &group_f); - MPI_Group_size(group_f, &nf); - /* MPI_Comm_group(comm, &group_c); */ - MPI_Comm_group(MPI_COMM_WORLD, &group_c); - for(i = 0; i < nf; i++) - ranks_gf[i] = i; - MPI_Group_translate_ranks(group_f, nf, ranks_gf, - group_c, ranks_gc); - - n_failed_imgs += nf; - j=0; - - for(i = old_nf; i < n_failed_imgs; i++) - { - failed_images_array[i] = ranks_gc[j]; - failed_images_array[i]++; - j++; - } - - error_called = 1; -} +/* The async. request-handle to all participating images. */ +MPI_Request alive_request; + +/* This dummy is used for the alive request. Its content is arbitrary and + * never read. Its just a memory location where one could put something, + * which is never done. */ +int alive_dummy; + +/* The mpi error-handler object associate to CAF_COMM_WORLD. */ +MPI_Errhandler failed_stopped_CAF_COMM_WORLD_mpi_errorhandler; + +/* The monitor comm for detecting failed images. We can not attach the monitor + * to CAF_COMM_WORLD or the messages send by sync images would be caught by + * the monitor. */ +MPI_Comm alive_comm; + +/* Set when entering a sync_images_internal, to prevent the error handler from + * eating our messages. */ +bool no_stopped_images_check_in_errhandler = 0; #endif /* For MPI interoperability, allow external initialization @@ -304,7 +295,10 @@ void helperFunction() } } #endif + + /* Keep in sync with single.c. */ + static void caf_runtime_error (const char *message, ...) { @@ -323,50 +317,204 @@ caf_runtime_error (const char *message, ...) exit (EXIT_FAILURE); } -/* FIXME: CMake chokes on the "inline" keyword below. If we can detect that CMake is */ -/* being used, we could add something of the form "#ifdef _CMAKE" to remove the */ -/* keyword only when building with CMake */ -/* inline */ void locking_atomic_op(MPI_Win win, int *value, int newval, - int compare, int image_index, int index) +/* Forward declaration of the feature unsupported message for failed images + * functions. */ +static void +unsupported_fail_images_message(const char * functionname); + +/* Forward declaration of the feature unimplemented message for allocatable + * components. */ +static void +unimplemented_alloc_comps_message(const char * functionname); + +static void +locking_atomic_op(MPI_Win win, int *value, int newval, + int compare, int image_index, int index) { - CAF_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, win); - MPI_Compare_and_swap (&newval,&compare,value, MPI_INT,image_index-1, - index*sizeof(int), win); - CAF_Win_unlock (image_index-1, win); + CAF_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, win); + MPI_Compare_and_swap (&newval,&compare,value, MPI_INT,image_index-1, + index*sizeof(int), win); + CAF_Win_unlock (image_index-1, win); } + +/* Define a helper to check whether the image at the given index is healthy, + * i.e., it hasn't failed. */ #ifdef WITH_FAILED_IMAGES -int communicator_shrink(MPI_Comm *comm) +#define check_image_health(image_index, stat) \ + if (image_stati[image_index - 1] == STAT_FAILED_IMAGE) \ + { \ + if (stat == NULL) terminate_internal (STAT_FAILED_IMAGE, 0); \ + *stat = STAT_FAILED_IMAGE; \ + return; \ + } +#else +#define check_image_health(image_index, stat) +#endif + +#ifdef WITH_FAILED_IMAGES +/** Handle failed image's errors and try to recover the remaining process to + * allow the user to detect an image fail and exit gracefully. */ +static void +failed_stopped_errorhandler_function (MPI_Comm* pcomm, int* perr, ...) { - int ns,srank,crank,rc,flag,i,drank,nc,nd; - MPI_Comm shrunk, newcomm; - MPI_Group cgrp, sgrp, dgrp; + MPI_Comm comm, shrunk, newcomm; + int num_failed_in_group, i, err; + MPI_Group comm_world_group, failed_group; + int *ranks_of_failed_in_comm_world, *ranks_failed; + int ns, srank, crank, rc, flag, drank, ierr, newrank; + bool stopped = false; + + comm = *pcomm; + + MPI_Error_class (*perr, &err); + if (err != MPIX_ERR_PROC_FAILED && err != MPIX_ERR_REVOKED) + { + /* We can handle PROC_FAILED and REVOKED ones only. */ + char errstr[MPI_MAX_ERROR_STRING]; + int errlen; + MPI_Error_string (err, errstr, &errlen); + /* We can't use caf_runtime_error here, because that would exit, which + * means only the one process will stop, but we need to stop MPI + * completely, which can be done by calling MPI_Abort(). */ + fprintf (stderr, "Fortran runtime error on image #%d:\nMPI error: '%s'.\n", + caf_this_image, errstr); + MPI_Abort (*pcomm, err); + } + + dprint ("%d/%d: %s (error = %d)\n", caf_this_image, caf_num_images, __FUNCTION__, err); + + MPIX_Comm_failure_ack (comm); + MPIX_Comm_failure_get_acked (comm, &failed_group); + MPI_Group_size (failed_group, &num_failed_in_group); + + dprint ("%d/%d: %s: %d images failed.\n", caf_this_image, caf_num_images, __FUNCTION__, num_failed_in_group); + if (num_failed_in_group <= 0) + { + *perr = MPI_SUCCESS; + return; + } + + MPI_Comm_group (comm, &comm_world_group); + ranks_of_failed_in_comm_world = (int *) alloca (sizeof (int) + * num_failed_in_group); + ranks_failed = (int *) alloca (sizeof (int) * num_failed_in_group); + for (i = 0; i < num_failed_in_group; ++i) + ranks_failed[i] = i; + /* Now translate the ranks of the failed images into communicator world. */ + MPI_Group_translate_ranks (failed_group, num_failed_in_group, ranks_failed, + comm_world_group, ranks_of_failed_in_comm_world); + + num_images_failed += num_failed_in_group; + + if (!no_stopped_images_check_in_errhandler) + { + int buffer, flag; + MPI_Request req; + MPI_Status request_status; + dprint ("%d/%d: Checking for stopped images.\n", caf_this_image, + caf_num_images); + ierr = MPI_Irecv (&buffer, 1, MPI_INT, MPI_ANY_SOURCE, MPI_TAG_CAF_SYNC_IMAGES, + CAF_COMM_WORLD, &req); + if (ierr == MPI_SUCCESS) + { + ierr = MPI_Test (&req, &flag, &request_status); + if (flag) + { + // Received a result + if (buffer == STAT_STOPPED_IMAGE) + { + dprint ("%d/%d: Image #%d found stopped.\n", + caf_this_image, caf_num_images, request_status.MPI_SOURCE); + stopped = true; + if (image_stati[request_status.MPI_SOURCE] == 0) + ++num_images_stopped; + image_stati[request_status.MPI_SOURCE] = STAT_STOPPED_IMAGE; + } + } + else + { + dprint ("%d/%d: No stopped images found.\n", + caf_this_image, caf_num_images); + MPI_Cancel (&req); + } + } + else + { + int err; + MPI_Error_class (ierr, &err); + dprint ("%d/%d: Error on checking for stopped images %d.\n", + caf_this_image, caf_num_images, err); + } + } + + /* TODO: Consider whether removing the failed image from images_full will be + * necessary. This is more or less politics. */ + for (i = 0; i < num_failed_in_group; ++i) + if (image_stati[ranks_of_failed_in_comm_world[i]] == 0) + image_stati[ranks_of_failed_in_comm_world[i]] = STAT_FAILED_IMAGE; + +redo: + dprint ("%d/%d: %s: Before shrink. \n", caf_this_image, caf_num_images, __FUNCTION__); + ierr = MPIX_Comm_shrink (*pcomm, &shrunk); + dprint ("%d/%d: %s: After shrink, rc = %d.\n", caf_this_image, caf_num_images, __FUNCTION__, ierr); + MPI_Comm_set_errhandler (shrunk, failed_stopped_CAF_COMM_WORLD_mpi_errorhandler); + MPI_Comm_size (shrunk, &ns); + MPI_Comm_rank (shrunk, &srank); - redo: - MPIX_Comm_shrink(*comm, &shrunk); - MPI_Comm_set_errhandler( shrunk, errh ); - MPI_Comm_size(shrunk, &ns); MPI_Comm_rank(shrunk, &srank); + MPI_Comm_rank (*pcomm, &crank); + + dprint ("%d/%d: %s: After getting ranks, ns = %d, srank = %d, crank = %d.\n", + caf_this_image, caf_num_images, __FUNCTION__, ns, srank, crank); - // MPI_Comm_rank(*comm, &crank); - MPI_Comm_rank(MPI_COMM_WORLD, &crank); /* Split does the magic: removing spare processes and reordering ranks * so that all surviving processes remain at their former place */ - if (img_status == STAT_STOPPED_IMAGE) - crank = -1; - rc = MPI_Comm_split(shrunk, crank<0?MPI_UNDEFINED:1, crank, &newcomm); + rc = MPI_Comm_split (shrunk, crank < 0 ? MPI_UNDEFINED : 1, crank, &newcomm); + MPI_Comm_rank (newcomm, &newrank); + dprint ("%d/%d: %s: After split, rc = %d, rank = %d.\n", caf_this_image, caf_num_images, __FUNCTION__, rc, newrank); flag = (rc == MPI_SUCCESS); /* Split or some of the communications above may have failed if * new failures have disrupted the process: we need to * make sure we succeeded at all ranks, or retry until it works. */ - flag = MPIX_Comm_agree(shrunk, &flag); + flag = MPIX_Comm_agree (newcomm, &flag); + dprint ("%d/%d: %s: After agree, flag = %d.\n", caf_this_image, caf_num_images, __FUNCTION__, flag); + + MPI_Comm_rank (newcomm, &drank); + dprint ("%d/%d: %s: After rank, drank = %d.\n", caf_this_image, caf_num_images, __FUNCTION__, drank); - MPI_Comm_free(&shrunk); - if( MPI_SUCCESS != flag ) { - if( MPI_SUCCESS == rc ) MPI_Comm_free(&newcomm); + MPI_Comm_free (&shrunk); + if (MPI_SUCCESS != flag) { + if (MPI_SUCCESS == rc) + MPI_Comm_free (&newcomm); goto redo; } - *comm = newcomm; - return MPI_SUCCESS; + + { + int cmpres; + ierr = MPI_Comm_compare (*pcomm, CAF_COMM_WORLD, &cmpres); + dprint ("%d/%d: %s: Comm_compare(*comm, CAF_COMM_WORLD, res = %d) = %d.\n", caf_this_image, + caf_num_images, __FUNCTION__, cmpres, ierr); + ierr = MPI_Comm_compare (*pcomm, alive_comm, &cmpres); + dprint ("%d/%d: %s: Comm_compare(*comm, alive_comm, res = %d) = %d.\n", caf_this_image, + caf_num_images, __FUNCTION__, cmpres, ierr); + if (cmpres == MPI_CONGRUENT) + { + MPI_Win_detach (*stat_tok, &img_status); + dprint ("%d/%d: %s: detached win img_status.\n", caf_this_image, caf_num_images, __FUNCTION__); + MPI_Win_free (stat_tok); + dprint ("%d/%d: %s: freed win img_status.\n", caf_this_image, caf_num_images, __FUNCTION__); + MPI_Win_create (&img_status, sizeof (int), 1, mpi_info_same_size, newcomm, + stat_tok); + dprint ("%d/%d: %s: (re-)created win img_status.\n", caf_this_image, caf_num_images, __FUNCTION__); + CAF_Win_lock_all (*stat_tok); + dprint ("%d/%d: %s: Win_lock_all on img_status.\n", caf_this_image, caf_num_images, __FUNCTION__); + } + } + /* Also free the old communicator before replacing it. */ + MPI_Comm_free (pcomm); + *pcomm = newcomm; + + *perr = stopped ? STAT_STOPPED_IMAGE : STAT_FAILED_IMAGE; } #endif @@ -375,29 +523,21 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, { const char msg[] = "Already locked"; #if MPI_VERSION >= 3 - int value = 0, compare = 0, newval = caf_this_image, i = 1, ierr=0; + int value = 0, compare = 0, newval = caf_this_image, ierr = 0, i = 0; #ifdef WITH_FAILED_IMAGES - int flag, it = 0, check_failure = 100, zero = 0; + int flag, check_failure = 100, zero = 0; #endif if(stat != NULL) *stat = 0; #ifdef WITH_FAILED_IMAGES - MPI_Test(&lock_req, &flag, MPI_STATUS_IGNORE); - - if(error_called == 1) - { - MPIX_Comm_revoke(CAF_COMM_WORLD); - communicator_shrink(&CAF_COMM_WORLD); - communicator_shrink(&lock_comm); - error_called = 0; - } + MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE); #endif - locking_atomic_op(win, &value, newval, compare, image_index, index); + locking_atomic_op (win, &value, newval, compare, image_index, index); - if(value == caf_this_image && image_index == caf_this_image) + if (value == caf_this_image && image_index == caf_this_image) goto stat_error; if(acquired_lock != NULL) @@ -409,63 +549,50 @@ void mutex_lock(MPI_Win win, int image_index, int index, int *stat, return; } - while(value != 0) + while (value != 0) { + ++i; #ifdef WITH_FAILED_IMAGES - it++; - - if(it == check_failure) - { - it = 1; - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); - } - - if(error_called == 1) - { - MPIX_Comm_revoke(CAF_COMM_WORLD); - communicator_shrink(&CAF_COMM_WORLD); - communicator_shrink(&lock_comm); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } + if (i == check_failure) + { + i = 1; + MPI_Test (&alive_request, &flag, MPI_STATUS_IGNORE); + } #endif locking_atomic_op(win, &value, newval, compare, image_index, index); #ifdef WITH_FAILED_IMAGES - for(i=0;i= 3 int value=1, ierr = 0, newval = 0; #ifdef WITH_FAILED_IMAGES - int flag, compare = 1; - - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); + int flag; - if(error_called == 1) - { - MPIX_Comm_revoke(CAF_COMM_WORLD); - communicator_shrink(&CAF_COMM_WORLD); - communicator_shrink(&lock_comm); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } + MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE); #endif CAF_Win_lock (MPI_LOCK_EXCLUSIVE, image_index-1, win); @@ -506,7 +624,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, if(stat) *stat = ierr; else if(ierr == STAT_FAILED_IMAGE) - error_stop (ierr); + terminate_internal (ierr, 0); return; @@ -519,7 +637,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, if(stat != NULL) *stat = 99; else - error_stop(99); + terminate_internal(99, 1); #else // MPI_VERSION #warning Locking for MPI-2 is not implemented printf ("Locking for MPI-2 is not supported, please update your MPI implementation\n"); @@ -530,11 +648,7 @@ void mutex_unlock(MPI_Win win, int image_index, int index, int *stat, GASNet initialization happened before. */ void -#ifdef COMPILER_SUPPORTS_CAF_INTRINSICS -_gfortran_caf_init (int *argc, char ***argv) -#else PREFIX (init) (int *argc, char ***argv) -#endif { #ifdef WITH_FAILED_IMAGES int flag; @@ -542,15 +656,12 @@ PREFIX (init) (int *argc, char ***argv) if (caf_num_images == 0) { int ierr = 0, i = 0, j = 0, rc; -#ifdef WITH_FAILED_IMAGES - n_failed_imgs = 0; -#endif int is_init = 0, prior_thread_level = MPI_THREAD_SINGLE; - MPI_Initialized(&is_init); + MPI_Initialized (&is_init); if (is_init) { - MPI_Query_thread(&prior_thread_level); + MPI_Query_thread (&prior_thread_level); } #ifdef HELPER int prov_lev=0; @@ -558,17 +669,17 @@ PREFIX (init) (int *argc, char ***argv) prov_lev = prior_thread_level; caf_owns_mpi = false; } else { - MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &prov_lev); + MPI_Init_thread (argc, argv, MPI_THREAD_MULTIPLE, &prov_lev); caf_owns_mpi = true; } - if(caf_this_image == 0 && MPI_THREAD_MULTIPLE != prov_lev) + if (caf_this_image == 0 && MPI_THREAD_MULTIPLE != prov_lev) caf_runtime_error ("MPI_THREAD_MULTIPLE is not supported: %d", prov_lev); #else if (is_init) { caf_owns_mpi = false; } else { - MPI_Init(argc, argv); + MPI_Init (argc, argv); caf_owns_mpi = true; } #endif @@ -577,129 +688,137 @@ PREFIX (init) (int *argc, char ***argv) /* Duplicate MPI_COMM_WORLD so that no CAF internal functions use it - this is critical for MPI-interoperability. */ - rc = MPI_Comm_dup(MPI_COMM_WORLD, &CAF_COMM_WORLD); + rc = MPI_Comm_dup (MPI_COMM_WORLD, &CAF_COMM_WORLD); #ifdef WITH_FAILED_IMAGES flag = (MPI_SUCCESS == rc); - flag = MPIX_Comm_agree(MPI_COMM_WORLD,&flag); - if(flag != MPI_SUCCESS) - MPI_Abort(MPI_COMM_WORLD,10000); - MPI_Barrier(MPI_COMM_WORLD); + rc = MPIX_Comm_agree (MPI_COMM_WORLD, &flag); + if (rc != MPI_SUCCESS) { + dprint ("%d/%d: %s: MPIX_Comm_agree(flag = %d) = %d.\n", + caf_this_image, caf_num_images, __FUNCTION__, flag, rc); + fflush (stderr); + MPI_Abort (MPI_COMM_WORLD, 10000); + } + MPI_Barrier (MPI_COMM_WORLD); #endif - MPI_Comm_size(CAF_COMM_WORLD, &caf_num_images); - MPI_Comm_rank(CAF_COMM_WORLD, &caf_this_image); + MPI_Comm_size (CAF_COMM_WORLD, &caf_num_images); + MPI_Comm_rank (CAF_COMM_WORLD, &caf_this_image); - caf_this_image++; + ++caf_this_image; caf_is_finalized = 0; + /* BEGIN SYNC IMAGE preparation + * Prepare memory for syncing images. */ images_full = (int *) calloc (caf_num_images-1, sizeof (int)); - - for (i = 1; i <= caf_num_images; ++i) + for (i = 1, j = 0; i <= caf_num_images; ++i) if (i != caf_this_image) - { - images_full[j] = i; - j++; - } + images_full[j++] = i; - orders = calloc (caf_num_images, sizeof (int)); arrived = calloc (caf_num_images, sizeof (int)); + sync_handles = malloc (caf_num_images * sizeof (MPI_Request)); + /* END SYNC IMAGE preparation. */ - sync_handles = malloc(caf_num_images * sizeof(MPI_Request)); - - stat_tok = malloc (sizeof(MPI_Win)); + stat_tok = malloc (sizeof (MPI_Win)); #ifdef WITH_FAILED_IMAGES - MPI_Comm_create_errhandler(verbose_comm_errhandler, &errh); - /* MPI_Comm_create_errhandler(fake_comm_errhandler, &errh_fake); */ - MPI_Comm_set_errhandler(CAF_COMM_WORLD, errh); - - MPI_Comm_dup(CAF_COMM_WORLD, &lock_comm); - /* MPI_Comm_set_errhandler(lock_comm, errh_fake); */ - MPI_Comm_set_errhandler(lock_comm, errh); - MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,lock_comm,&lock_req); - - MPI_Comm_dup(CAF_COMM_WORLD, &stopped_comm); - MPI_Comm_set_errhandler(stopped_comm, errh); - MPI_Irecv(&tmp_lock,1,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,stopped_comm,&stopped_req); + MPI_Comm_dup (MPI_COMM_WORLD, &alive_comm); + /* Handling of failed/stopped images is done by setting an error handler + * on a asynchronous request to each other image. For a failing image + * the request will trigger the call of the error handler thus allowing + * each other image to handle the failed/stopped image. */ + MPI_Comm_create_errhandler (failed_stopped_errorhandler_function, + &failed_stopped_CAF_COMM_WORLD_mpi_errorhandler); + MPI_Comm_set_errhandler (CAF_COMM_WORLD, + failed_stopped_CAF_COMM_WORLD_mpi_errorhandler); + MPI_Comm_set_errhandler (alive_comm, + failed_stopped_CAF_COMM_WORLD_mpi_errorhandler); + MPI_Comm_set_errhandler (MPI_COMM_WORLD, MPI_ERRORS_RETURN); + + MPI_Irecv (&alive_dummy, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, + alive_comm, &alive_request); + + image_stati = (int *) calloc (caf_num_images, sizeof (int)); +#endif - MPI_Win_create_errhandler(verbose_win_errhandler, &errh_w); - - ranks_gf = (int*)calloc(caf_num_images,sizeof(int)); - ranks_gc = (int*)calloc(caf_num_images,sizeof(int)); - failed_images_array = (int*)calloc(caf_num_images,sizeof(int)); - /* stopped_imgs = (int*)calloc(caf_num_images, sizeof(int)); */ -#else -# if MPI_VERSION >= 3 +#if MPI_VERSION >= 3 MPI_Info_create (&mpi_info_same_size); MPI_Info_set (mpi_info_same_size, "same_size", "true"); -# endif -#endif -#if MPI_VERSION >= 3 /* Setting img_status */ - MPI_Win_create(&img_status, sizeof(int), 1, mpi_info_same_size, CAF_COMM_WORLD, stat_tok); + MPI_Win_create (&img_status, sizeof(int), 1, mpi_info_same_size, CAF_COMM_WORLD, stat_tok); CAF_Win_lock_all (*stat_tok); #else - MPI_Win_create(&img_status, sizeof(int), 1, MPI_INFO_NULL, CAF_COMM_WORLD, stat_tok); + MPI_Win_create (&img_status, sizeof(int), 1, MPI_INFO_NULL, CAF_COMM_WORLD, stat_tok); #endif // MPI_VERSION } } -/* Forward declaration of sync_images. */ -void -sync_images_internal (int count, int images[], int *stat, char *errmsg, - int errmsg_len, bool internal); - -/* Finalize coarray program. */ +/* Internal finalize of coarray program. */ void -#ifdef COMPILER_SUPPORTS_CAF_INTRINSICS -_gfortran_caf_finalize (void) -#else -PREFIX (finalize) (void) -#endif +finalize_internal (int status_code) { -#ifdef WITH_FAILED_IMAGES - int flag = 0, i = 0, j = 0, failed = 0, one = 1; - stopped_imgs[caf_this_image-1] = 1; - /* MPI_Win_sync(*stat_tok); */ + dprint ("%d/%d: %s(status_code = %d)\n", + caf_this_image, caf_num_images, __FUNCTION__, status_code); - for(i=0;iprev; p = TOKEN(tmp_tot->token); + dprint ("%d/%d: %s: Before CAF_Win_unlock_all (*p)\n", + caf_this_image, caf_num_images, __FUNCTION__); CAF_Win_unlock_all (*p); + dprint ("%d/%d: %s: After CAF_Win_unlock_all (*p)\n", + caf_this_image, caf_num_images, __FUNCTION__); #ifdef GCC_GE_7 /* Unregister the window to the descriptors when freeing the token. */ if (((mpi_caf_token_t *)tmp_tot->token)->desc) { mpi_caf_token_t *mpi_token = (mpi_caf_token_t *)tmp_tot->token; - CAF_Win_unlock_all(*(mpi_token->desc)); + CAF_Win_unlock_all (*(mpi_token->desc)); MPI_Win_free (mpi_token->desc); free (mpi_token->desc); } #endif // GCC_GE_7 - MPI_Win_free(p); - free(tmp_tot); + MPI_Win_free (p); + free (tmp_tot); tmp_tot = prev; } #if MPI_VERSION >= 3 MPI_Info_free (&mpi_info_same_size); #endif // MPI_VERSION +#ifdef WITH_FAILED_IMAGES + if (status_code == 0) + { + dprint ("%d/%d: %s: before Win_unlock_all.\n", + caf_this_image, caf_num_images, __FUNCTION__); + CAF_Win_unlock_all (*stat_tok); + dprint ("%d/%d: %s: before Win_free(stat_tok)\n", + caf_this_image, caf_num_images, __FUNCTION__); + MPI_Win_free (stat_tok); + dprint ("%d/%d: %s: before Comm_free(CAF_COMM_WORLD)\n", + caf_this_image, caf_num_images, __FUNCTION__); + MPI_Comm_free (&CAF_COMM_WORLD); + MPI_Comm_free (&alive_comm); + dprint ("%d/%d: %s: after Comm_free(CAF_COMM_WORLD)\n", + caf_this_image, caf_num_images, __FUNCTION__); + } + + MPI_Errhandler_free (&failed_stopped_CAF_COMM_WORLD_mpi_errorhandler); + + /* Only call Finalize if CAF runtime Initialized MPI. */ + if (caf_owns_mpi) + MPI_Finalize (); +#else + MPI_Comm_free (&CAF_COMM_WORLD); + CAF_Win_unlock_all (*stat_tok); MPI_Win_free (stat_tok); - MPI_Comm_free(&CAF_COMM_WORLD); /* Only call Finalize if CAF runtime Initialized MPI. */ - if (caf_owns_mpi) { - MPI_Finalize(); - } - pthread_mutex_lock(&lock_am); + if (caf_owns_mpi) + MPI_Finalize (); +#endif + + pthread_mutex_lock (&lock_am); caf_is_finalized = 1; - pthread_mutex_unlock(&lock_am); + pthread_mutex_unlock (&lock_am); free (sync_handles); + dprint ("%d/%d: %s: Finalisation done!!!\n", caf_this_image, caf_num_images, + __FUNCTION__); } +/* Finalize coarray program. */ + +void +PREFIX (finalize) (void) +{ + finalize_internal (0); +} + +/* TODO: This is interface is violating the F2015 standard, but not the gfortran + * API. Fix it (the fortran API). */ int -PREFIX (this_image)(int distance __attribute__ ((unused))) +PREFIX (this_image) (int distance __attribute__ ((unused))) { return caf_this_image; } - +/* TODO: This is interface is violating the F2015 standard, but not the gfortran + * API. Fix it (the fortran API). */ int -PREFIX (num_images)(int distance __attribute__ ((unused)), - int failed __attribute__ ((unused))) +PREFIX (num_images) (int distance __attribute__ ((unused)), + int failed __attribute__ ((unused))) { return caf_num_images; } @@ -812,7 +973,7 @@ PREFIX (register) (size_t size, caf_register_t type, caf_token_t *token, *token = malloc (sizeof (mpi_caf_token_t)); mpi_token = (mpi_caf_token_t *) *token; - p = TOKEN(mpi_token); + p = TOKEN (mpi_token); if ((type == CAF_REGTYPE_COARRAY_ALLOC_ALLOCATE_ONLY || type == CAF_REGTYPE_COARRAY_ALLOC @@ -905,24 +1066,14 @@ PREFIX (register) (size_t size, caf_register_t type, caf_token_t *token, } } #else // GCC_GE_7 -#ifdef COMPILER_SUPPORTS_CAF_INTRINSICS void * - _gfortran_caf_register (size_t size, caf_register_t type, caf_token_t *token, - int *stat, char *errmsg, int errmsg_len) -#else -void * - PREFIX (register) (size_t size, caf_register_t type, caf_token_t *token, - int *stat, char *errmsg, int errmsg_len) -#endif +PREFIX (register) (size_t size, caf_register_t type, caf_token_t *token, + int *stat, char *errmsg, int errmsg_len) { /* int ierr; */ void *mem; size_t actual_size; int l_var=0, *init_array = NULL; -#ifdef WITH_FAILED_IMAGES - int ierr = 0, flag = 0; - MPI_Win *stopped_win; -#endif if (unlikely (caf_is_finalized)) goto error; @@ -937,9 +1088,6 @@ void * /* Token contains only a list of pointers. */ *token = malloc (sizeof(MPI_Win)); -#ifdef WITH_FAILED_IMAGES - stopped_win = (MPI_Win *)malloc(sizeof(MPI_Win)); -#endif MPI_Win *p = *token; if(type == CAF_REGTYPE_LOCK_STATIC || type == CAF_REGTYPE_LOCK_ALLOC || @@ -952,16 +1100,6 @@ void * else actual_size = size; -#ifdef WITH_FAILED_IMAGES - MPI_Barrier(CAF_COMM_WORLD); - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - } -#endif - #if MPI_VERSION >= 3 MPI_Win_allocate(actual_size, 1, mpi_info_same_size, CAF_COMM_WORLD, &mem, p); CAF_Win_lock_all (*p); @@ -970,16 +1108,6 @@ void * MPI_Win_create(mem, actual_size, 1, MPI_INFO_NULL, CAF_COMM_WORLD, p); #endif // MPI_VERSION -#ifdef WITH_FAILED_IMAGES - /* MPI_Win_set_errhandler(*p,errh_w); */ - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - } -#endif - if(l_var) { init_array = (int *)calloc(size, sizeof(int)); @@ -988,26 +1116,13 @@ void * 0, size, MPI_INT, *p); CAF_Win_unlock(caf_this_image - 1, *p); free(init_array); - /* PREFIX(sync_all) (NULL,NULL,0); */ } -#ifdef WITH_FAILED_IMAGES - if(error_called == 1) - { - MPIX_Comm_revoke(CAF_COMM_WORLD); - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - } -#endif - PREFIX(sync_all) (NULL,NULL,0); caf_static_t *tmp = malloc (sizeof (caf_static_t)); tmp->prev = caf_tot; tmp->token = *token; -#ifdef WITH_FAILED_IMAGES - tmp->stopped_token = stopped_win; -#endif caf_tot = tmp; if (type == CAF_REGTYPE_COARRAY_STATIC) @@ -1020,11 +1135,6 @@ void * if (stat) *stat = 0; -#ifdef WITH_FAILED_IMAGES - else if (ierr == STAT_FAILED_IMAGE) - error_stop (ierr); -#endif - return mem; error: @@ -1065,8 +1175,6 @@ void PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len) #endif { - /* int ierr; */ - if (unlikely (caf_is_finalized)) { const char msg[] = "Failed to deallocate coarray - " @@ -1089,14 +1197,7 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len } #ifdef WITH_FAILED_IMAGES - MPI_Barrier(CAF_COMM_WORLD); - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - MPI_Barrier(CAF_COMM_WORLD); - } + MPI_Barrier (CAF_COMM_WORLD); #else PREFIX (sync_all) (NULL, NULL, 0); #endif @@ -1104,41 +1205,41 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len caf_static_t *tmp = caf_tot, *prev = caf_tot, *next=caf_tot; MPI_Win *p; - while(tmp) + while (tmp) { prev = tmp->prev; - if(tmp->token == *token) + if (tmp->token == *token) { - p = TOKEN(*token); - CAF_Win_unlock_all(*p); + p = TOKEN (*token); + CAF_Win_unlock_all (*p); #ifdef GCC_GE_7 mpi_caf_token_t *mpi_token = *(mpi_caf_token_t **)token; if (mpi_token->local_memptr) { - MPI_Win_free(p); + MPI_Win_free (p); mpi_token->local_memptr = NULL; } if ((*(mpi_caf_token_t **)token)->desc && type != CAF_DEREGTYPE_COARRAY_DEALLOCATE_ONLY) { - CAF_Win_unlock_all(*(mpi_token->desc)); + CAF_Win_unlock_all (*(mpi_token->desc)); MPI_Win_free (mpi_token->desc); free (mpi_token->desc); } #else - MPI_Win_free(p); + MPI_Win_free (p); #endif - if(prev) + if (prev) next->prev = prev->prev; else next->prev = NULL; - if(tmp == caf_tot) + if (tmp == caf_tot) caf_tot = prev; - free(tmp); + free (tmp); break; } @@ -1153,10 +1254,12 @@ PREFIX (deregister) (caf_token_t *token, int *stat, char *errmsg, int errmsg_len } void -PREFIX (sync_memory) (int *stat, char *errmsg, int errmsg_len) +PREFIX (sync_memory) (int *stat __attribute__ ((unused)), + char *errmsg __attribute__ ((unused)), + int errmsg_len __attribute__ ((unused))) { #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK) - explicit_flush(); + explicit_flush (); #endif } @@ -1164,38 +1267,39 @@ PREFIX (sync_memory) (int *stat, char *errmsg, int errmsg_len) void PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) { - int ierr = 0, flag = 0; + int ierr = 0; if (unlikely (caf_is_finalized)) ierr = STAT_STOPPED_IMAGE; else { + int mpi_err; #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK) explicit_flush(); #endif - MPI_Barrier(CAF_COMM_WORLD); - } #ifdef WITH_FAILED_IMAGES - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - MPI_Barrier(CAF_COMM_WORLD); - } + mpi_err = MPI_Barrier (alive_comm); +#else + mpi_err = MPI_Barrier (CAF_COMM_WORLD); #endif + dprint ("%d/%d: %s: MPI_Barrier = %d.\n", caf_this_image, caf_num_images, + __FUNCTION__, mpi_err); + if (mpi_err == STAT_FAILED_IMAGE) + ierr = STAT_FAILED_IMAGE; + else if (mpi_err != 0) + MPI_Error_class (mpi_err, &ierr); + } - if (stat) + if (stat != NULL) *stat = ierr; #ifdef WITH_FAILED_IMAGES - else if(ierr == STAT_FAILED_IMAGE) - error_stop (ierr); - - if (ierr != 0 && ierr != STAT_FAILED_IMAGE) -#else - if (ierr) + else if (ierr == STAT_FAILED_IMAGE) + /* F2015 requests stat to be set for FAILED IMAGES, else error out. */ + terminate_internal (ierr, 0); #endif + + if (ierr != 0 && ierr != STAT_FAILED_IMAGE) { char *msg; if (caf_is_finalized) @@ -1211,7 +1315,7 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) if (errmsg_len > len) memset (&errmsg[len], ' ', errmsg_len-len); } - else + else if (stat == NULL) caf_runtime_error (msg); } } @@ -1223,45 +1327,43 @@ PREFIX (sync_all) (int *stat, char *errmsg, int errmsg_len) /* size: The number of bytes to be transferred. */ /* asynchronous: Return before the data transfer has been complete */ -void selectType(int size, MPI_Datatype *dt) +void selectType (int size, MPI_Datatype *dt) { int t_s; - MPI_Type_size(MPI_INT, &t_s); + MPI_Type_size (MPI_INT, &t_s); - if(t_s==size) + if (t_s == size) { - *dt=MPI_INT; + *dt = MPI_INT; return; } - MPI_Type_size(MPI_DOUBLE, &t_s); + MPI_Type_size (MPI_DOUBLE, &t_s); - if(t_s==size) + if (t_s == size) { - *dt=MPI_DOUBLE; + *dt = MPI_DOUBLE; return; } - MPI_Type_size(MPI_COMPLEX, &t_s); + MPI_Type_size (MPI_COMPLEX, &t_s); - if(t_s==size) + if (t_s == size) { - *dt=MPI_COMPLEX; + *dt = MPI_COMPLEX; return; } - MPI_Type_size(MPI_DOUBLE_COMPLEX, &t_s); + MPI_Type_size (MPI_DOUBLE_COMPLEX, &t_s); - if(t_s==size) + if (t_s == size) { - *dt=MPI_DOUBLE_COMPLEX; + *dt = MPI_DOUBLE_COMPLEX; return; } - } -/* Not yet adapted for failed images */ void PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, gfc_descriptor_t *dest, @@ -1269,7 +1371,7 @@ PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, caf_token_t token_g, size_t offset_g, int image_index_g, gfc_descriptor_t *src , caf_vector_t *src_vector __attribute__ ((unused)), - int src_kind, int dst_kind, bool mrt) + int src_kind, int dst_kind, bool mrt, int *stat) { int ierr = 0; size_t i, size; @@ -1295,6 +1397,9 @@ PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, if (size == 0) return; + check_image_health (image_index_s, stat); + check_image_health (image_index_g, stat); + if (rank == 0 || (GFC_DESCRIPTOR_TYPE (dest) == GFC_DESCRIPTOR_TYPE (src) && dst_kind == src_kind && GFC_DESCRIPTOR_RANK (src) != 0 @@ -1324,7 +1429,7 @@ PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, CAF_Win_unlock (image_index_s - 1, *p_s); if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); return; free(tmp); @@ -1390,7 +1495,7 @@ PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, if (ierr != 0) { - error_stop (ierr); + terminate_internal (ierr, 0); return; } } @@ -1399,6 +1504,7 @@ PREFIX (sendget) (caf_token_t token_s, size_t offset_s, int image_index_s, } + /* Send array data from src to dest on a remote image. */ /* The last argument means may_require_temporary */ @@ -1437,21 +1543,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, if (size == 0) return; -#ifdef WITH_FAILED_IMAGES - for(j=0;j src_size) { @@ -1518,25 +1610,10 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, } #ifdef WITH_FAILED_IMAGES - /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - /* communicator_shrink(&lock_comm); */ - error_called = 0; - fake_error_called = 0; - ierr = STAT_FAILED_IMAGE; - } - - if(!stat && ierr == STAT_FAILED_IMAGE) - error_stop (ierr); - - if(stat) - *stat = ierr; + check_image_health (image_index , stat); #else if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); #endif return; } @@ -1596,7 +1673,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, ptrdiff_t array_offset_sr = 0; stride = 1; extent = 1; - tot_ext = 1; + tot_ext = 1; for (j = 0; j < GFC_DESCRIPTOR_RANK (src)-1; j++) { array_offset_sr += ((i / tot_ext) @@ -1605,7 +1682,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, * src->dim[j]._stride; extent = (src->dim[j]._ubound - src->dim[j].lower_bound + 1); stride = src->dim[j]._stride; - tot_ext *= extent; + tot_ext *= extent; } array_offset_sr += (i / tot_ext) * src->dim[rank-1]._stride; @@ -1633,33 +1710,22 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, CAF_Win_unlock (image_index - 1, *p); #ifdef WITH_FAILED_IMAGES - /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } - + check_image_health (image_index, stat); + if(!stat && ierr == STAT_FAILED_IMAGE) - error_stop (ierr); + error_stop (ierr); if(stat) - *stat = ierr; - - MPI_Type_free (&dt_s); - MPI_Type_free (&dt_d); + *stat = ierr; #else if (ierr != 0) { error_stop (ierr); return; } - +#endif MPI_Type_free (&dt_s); MPI_Type_free (&dt_d); -#endif #else if(caf_this_image == image_index && mrt) @@ -1736,7 +1802,7 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, #ifndef WITH_FAILED_IMAGES if (ierr != 0) { - error_stop (ierr); + caf_runtime_error ("MPI Error: %d", ierr); return; } #endif @@ -1774,22 +1840,8 @@ PREFIX (send) (caf_token_t token, size_t offset, int image_index, } CAF_Win_unlock (image_index - 1, *p); #endif -#ifdef WITH_FAILED_IMAGES - /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } - - if(!stat && ierr == STAT_FAILED_IMAGE) - error_stop (ierr); - - if(stat) - *stat = ierr; -#endif + + check_image_health (image_index, stat); } } @@ -1805,7 +1857,7 @@ PREFIX (get) (caf_token_t token, size_t offset, bool mrt, int *stat) { size_t i, size; - int ierr = 0, j, flag; + int ierr = 0, j; MPI_Win *p = TOKEN(token); int rank = GFC_DESCRIPTOR_RANK (src); size_t src_size = GFC_DESCRIPTOR_SIZE (src); @@ -1827,21 +1879,7 @@ PREFIX (get) (caf_token_t token, size_t offset, if (size == 0) return; -#ifdef WITH_FAILED_IMAGES - for(j=0;j src_size) { @@ -1877,27 +1915,11 @@ PREFIX (get) (caf_token_t token, size_t offset, memcpy ((char *) dest->base_addr + src_size, pad_str, dst_size-src_size); CAF_Win_unlock (image_index - 1, *p); -#ifdef WITH_FAILED_IMAGES - /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } - if(!stat && ierr == STAT_FAILED_IMAGE) - error_stop (ierr); - - if(stat) - *stat = ierr; - } -#else + check_image_health (image_index, stat); } if (ierr != 0) - error_stop (ierr); -#endif + terminate_internal (ierr, 0); return; } @@ -1989,15 +2011,8 @@ PREFIX (get) (caf_token_t token, size_t offset, CAF_Win_lock (MPI_LOCK_SHARED, image_index - 1, *p); ierr = MPI_Get (dst, 1, dt_d, image_index-1, offset, 1, dt_s, *p); #ifdef WITH_FAILED_IMAGES - /* MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); */ + check_image_health (image_index, stat); - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } - if(stat) *stat = ierr; else if(ierr == STAT_FAILED_IMAGE) @@ -2083,7 +2098,7 @@ PREFIX (get) (caf_token_t token, size_t offset, memcpy ((char *) dst + src_size, pad_str, dst_size-src_size); } if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); } if(caf_this_image == image_index && mrt) @@ -2913,6 +2928,8 @@ _gfortran_caf_get_by_ref (caf_token_t token, int image_index, if (stat) *stat = 0; + check_image_health (image_index, stat); + GET_REMOTE_DESC (mpi_token, src, primary_src_desc_data, image_index - 1); /* Compute the size of the result. In the beginning size just counts the number of elements. */ @@ -3325,8 +3342,9 @@ PREFIX(send_by_ref) (caf_token_t token, int image_index, int dst_kind, int src_kind, bool may_require_tmp, bool dst_reallocatable, int *stat) { - fprintf (stderr, "COARRAY ERROR: caf_send_by_ref() not implemented yet "); - error_stop (1); + unimplemented_alloc_comps_message("caf_send_by_ref()"); + // Make sure we exit + terminate_internal (1, 1); } @@ -3337,23 +3355,26 @@ PREFIX(sendget_by_ref) (caf_token_t dst_token, int dst_image_index, int dst_kind, int src_kind, bool may_require_tmp, int *dst_stat, int *src_stat) { - fprintf (stderr, "COARRAY ERROR: caf_sendget_by_ref() not implemented yet "); - error_stop (1); + unimplemented_alloc_comps_message("caf_sendget_by_ref()"); + // Make sure we exit + terminate_internal (1, 1); } int PREFIX(is_present) (caf_token_t token, int image_index, caf_reference_t *refs) { - fprintf (stderr, "COARRAY ERROR: caf_is_present() not implemented yet "); - error_stop (1); + unimplemented_alloc_comps_message("caf_is_present()"); + // Make sure we exit + terminate_internal (1, 1); } #endif /* SYNC IMAGES. Note: SYNC IMAGES(*) is passed as count == -1 while SYNC IMAGES([]) has count == 0. Note further that SYNC IMAGES(*) - is not equivalent to SYNC ALL. */ + is not semantically equivalent to SYNC ALL. */ + void PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg, int errmsg_len) @@ -3361,17 +3382,25 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg, sync_images_internal (count, images, stat, errmsg, errmsg_len, false); } -void +static void sync_images_internal (int count, int images[], int *stat, char *errmsg, int errmsg_len, bool internal) { int ierr = 0, i = 0, j = 0, int_zero = 0, done_count = 0; MPI_Status s; +#ifdef WITH_FAILED_IMAGES + no_stopped_images_check_in_errhandler = true; +#endif + dprint ("%d/%d: Entering %s.\n", caf_this_image, caf_num_images, __FUNCTION__); if (count == 0 || (count == 1 && images[0] == caf_this_image)) { if (stat) *stat = 0; +#ifdef WITH_FAILED_IMAGES + no_stopped_images_check_in_errhandler = false; +#endif + dprint ("%d/%d: Leaving %s early.\n", caf_this_image, caf_num_images, __FUNCTION__); return; } @@ -3404,21 +3433,21 @@ sync_images_internal (int count, int images[], int *stat, char *errmsg, { if(count == -1) { - for (i = 0; i < caf_num_images - 1; ++i) - ++orders[images_full[i] - 1]; count = caf_num_images - 1; images = images_full; } - else - { - for (i = 0; i < count; ++i) - ++orders[images[i] - 1]; - } #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK) explicit_flush(); #endif +#ifdef WITH_FAILED_IMAGES + { + int flag; + /* Provoke detecting process fails. */ + MPI_Test (&alive_request, &flag, MPI_STATUS_IGNORE); + } +#endif /* A rather simple way to synchronice: - expect all images to sync with receiving an int, - on the other side, send all processes to sync with an int, @@ -3439,18 +3468,19 @@ sync_images_internal (int count, int images[], int *stat, char *errmsg, also have reached a sync images statement. This implementation makes no assumption when the image continues or in which order synced images continue. */ - for(i = 0; i < count; ++i) + for (i = 0; i < count; ++i) /* Need to have the request handlers contigously in the handlers array or waitany below will trip about the handler as illegal. */ - ierr = MPI_Irecv (&arrived[images[i] - 1], 1, MPI_INT, images[i] - 1, 0, - CAF_COMM_WORLD, &sync_handles[i]); - for(i = 0; i < count; ++i) - MPI_Send (&int_zero, 1, MPI_INT, images[i] - 1, 0, CAF_COMM_WORLD); + ierr = MPI_Irecv (&arrived[images[i] - 1], 1, MPI_INT, images[i] - 1, + MPI_TAG_CAF_SYNC_IMAGES, CAF_COMM_WORLD, &sync_handles[i]); + for (i = 0; i < count; ++i) + MPI_Send (&int_zero, 1, MPI_INT, images[i] - 1, MPI_TAG_CAF_SYNC_IMAGES, + CAF_COMM_WORLD); done_count = 0; while (done_count < count) { ierr = MPI_Waitany (count, sync_handles, &i, &s); - if (i != MPI_UNDEFINED) + if (ierr == MPI_SUCCESS && i != MPI_UNDEFINED) { ++done_count; if (ierr == MPI_SUCCESS && arrived[s.MPI_SOURCE] == STAT_STOPPED_IMAGE) @@ -3464,29 +3494,40 @@ sync_images_internal (int count, int images[], int *stat, char *errmsg, } } else if (ierr != MPI_SUCCESS) - /* Abort receives here, too, when implemented above. */ - break; - } - #ifdef WITH_FAILED_IMAGES - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } + { + int err; + MPI_Error_class (ierr, &err); + if (err == MPIX_ERR_PROC_FAILED) + { + int flag; + dprint ("%d/%d: Image failed, provoking error handling.\n", + caf_this_image, caf_num_images); + ierr = STAT_FAILED_IMAGE; + /* Provoke detecting process fails. */ + MPI_Test (&alive_request, &flag, MPI_STATUS_IGNORE); + } + break; + } +#else + break; #endif + } } sync_images_err_chk: +#ifdef WITH_FAILED_IMAGES + no_stopped_images_check_in_errhandler = false; +#endif + dprint ("%d/%d: Leaving %s.\n", caf_this_image, caf_num_images, __FUNCTION__); if (stat) *stat = ierr; #ifdef WITH_FAILED_IMAGES - else if(ierr == STAT_FAILED_IMAGE) - error_stop (ierr); + else if (ierr == STAT_FAILED_IMAGE) + terminate_internal (ierr, 0); #endif - if (ierr && stat == NULL) + if (ierr != 0 && ierr != STAT_FAILED_IMAGE) { char *msg; if (caf_is_finalized) @@ -3502,7 +3543,7 @@ sync_images_internal (int count, int images[], int *stat, char *errmsg, if (errmsg_len > len) memset (&errmsg[len], ' ', errmsg_len-len); } - else if (!internal) + else if (!internal && stat == NULL) caf_runtime_error (msg); } } @@ -4043,7 +4084,7 @@ PREFIX (atomic_define) (caf_token_t token, size_t offset, if (stat) *stat = ierr; else if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); return; } @@ -4079,7 +4120,7 @@ PREFIX(atomic_ref) (caf_token_t token, size_t offset, if (stat) *stat = ierr; else if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); return; } @@ -4117,7 +4158,7 @@ PREFIX(atomic_cas) (caf_token_t token, size_t offset, if (stat) *stat = ierr; else if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); return; } @@ -4173,7 +4214,7 @@ PREFIX (atomic_op) (int op, caf_token_t token , if (stat) *stat = ierr; else if (ierr != 0) - error_stop (ierr); + terminate_internal (ierr, 0); return; } @@ -4206,19 +4247,10 @@ PREFIX (event_post) (caf_token_t token, size_t index, printf ("Events for MPI-2 are not supported, please update your MPI implementation\n"); #endif // MPI_VERSION -#ifdef WITH_FAILED_IMAGES - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } -#endif + check_image_health (image_index, stat); if(!stat && ierr == STAT_FAILED_IMAGE) - error_stop (ierr); + terminate_internal (ierr, 0); if(ierr != MPI_SUCCESS) { @@ -4275,19 +4307,10 @@ PREFIX (event_wait) (caf_token_t token, size_t index, ierr = MPI_Fetch_and_op(&newval, &old, MPI_INT, image, index*sizeof(int), MPI_SUM, *p); CAF_Win_unlock (image, *p); -#ifdef WITH_FAILED_IMAGES - MPI_Test(&lock_req,&flag,MPI_STATUS_IGNORE); - - if(error_called == 1) - { - communicator_shrink(&CAF_COMM_WORLD); - error_called = 0; - ierr = STAT_FAILED_IMAGE; - } -#endif + check_image_health (image, stat); if(!stat && ierr == STAT_FAILED_IMAGE) - error_stop (ierr); + terminate_internal (ierr, 0); if(ierr != MPI_SUCCESS) { @@ -4328,28 +4351,39 @@ PREFIX (event_query) (caf_token_t token, size_t index, *stat = ierr; } -/* ERROR STOP the other images. */ + +/* Internal function to execute the part that is common to all (error) stop + * functions. */ static void -error_stop (int error) +terminate_internal (int stat_code, int exit_code) { - /* FIXME: Shutdown the Fortran RTL to flush the buffer. PR 43849. */ - /* FIXME: Do some more effort than just gasnet_exit(). */ - MPI_Abort(CAF_COMM_WORLD, error); + dprint ("%d/%d: terminate_internal (stat_code = %d, exit_code = %d).\n", + caf_this_image, caf_num_images, stat_code, exit_code); + finalize_internal (stat_code); - /* Should be unreachable, but to make sure also call exit. */ - exit (error); +#ifndef WITH_FAILED_IMAGES + MPI_Abort(MPI_COMM_WORLD, exit_code); +#endif + exit (exit_code); } + /* STOP function for integer arguments. */ + void PREFIX (stop_numeric) (int32_t stop_code) { fprintf (stderr, "STOP %d\n", stop_code); - PREFIX (finalize) (); + + /* Stopping includes taking down the runtime regularly and returning the + * stop_code. */ + terminate_internal (STAT_STOPPED_IMAGE, stop_code); } + /* STOP function for string arguments. */ + void PREFIX (stop_str) (const char *string, int32_t len) { @@ -4358,9 +4392,11 @@ PREFIX (stop_str) (const char *string, int32_t len) fputc (*(string++), stderr); fputs ("\n", stderr); - PREFIX (finalize) (); + /* Stopping includes taking down the runtime regularly. */ + terminate_internal (STAT_STOPPED_IMAGE, 0); } + /* ERROR STOP function for string arguments. */ void @@ -4371,7 +4407,7 @@ PREFIX (error_stop_str) (const char *string, int32_t len) fputc (*(string++), stderr); fputs ("\n", stderr); - error_stop (1); + terminate_internal (STAT_STOPPED_IMAGE, 1); } @@ -4381,33 +4417,72 @@ void PREFIX (error_stop) (int32_t error) { fprintf (stderr, "ERROR STOP %d\n", error); - error_stop (error); + + terminate_internal (STAT_STOPPED_IMAGE, error); } + +/* FAIL IMAGE statement. */ + void PREFIX (fail_image) (void) { -#ifdef WITH_FAILED_IMAGES - // TODO -#else fputs ("IMAGE FAILED!\n", stderr); - exit (0); -#endif + + raise(SIGKILL); + /* A failing image is expected to take down the runtime regularly. */ + terminate_internal (STAT_FAILED_IMAGE, 0); } int PREFIX (image_status) (int image) { +#ifdef GFC_CAF_CHECK + if (image < 1 || image > caf_num_images) + { + char errmsg[60]; + sprintf (errmsg, "Image #%d out of bounds of images 1..%d.", image, + caf_num_images); + caf_runtime_error (errmsg); + } +#endif #ifdef WITH_FAILED_IMAGES - int i; - - for(i = 0; i < n_failed_imgs; ++i) - if(image == failed_images_array[i]) - return STAT_FAILED_IMAGE; - if(stopped_imgs[image - 1] != 0) - return STAT_STOPPED_IMAGE; + if (image_stati[image - 1] == 0) + { + int status, ierr; + /* Check that we are fine before doing anything. + * + * Do an MPI-operation to learn about failed/stopped images, that have + * not been detected yet. */ + ierr = MPI_Test (&alive_request, &status, MPI_STATUSES_IGNORE); + MPI_Error_class (ierr, &status); + if (ierr == MPI_SUCCESS) + { + CAF_Win_lock (MPI_LOCK_SHARED, image - 1, *stat_tok); + ierr = MPI_Get (&status, 1, MPI_INT, image - 1, 0, 1, MPI_INT, *stat_tok); + dprint ("%d/%d: Image status of image #%d is: %d\n", caf_this_image, + caf_num_images, image, status); + CAF_Win_unlock (image - 1, *stat_tok); + image_stati[image - 1] = status; + } + else if (status == MPIX_ERR_PROC_FAILED) + image_stati[image - 1] = STAT_FAILED_IMAGE; + else + { + const int strcap = 200; + char errmsg[strcap]; + int slen, supplied_len; + sprintf (errmsg, "Image status for image #%d returned mpi error: ", + image); + slen = strlen (errmsg); + supplied_len = strcap - slen; + MPI_Error_string (status, &errmsg[slen], &supplied_len); + caf_runtime_error (errmsg); + } + } + return image_stati[image - 1]; #else - fputs("IMAGE_STATUS() support unavailable in this build.\n", stderr); + unsupported_fail_images_message ("IMAGE_STATUS()"); #endif return 0; @@ -4420,42 +4495,40 @@ PREFIX (failed_images) (gfc_descriptor_t *array, int team __attribute__ ((unused int local_kind = kind ? *kind : 4; /* GFC_DEFAULT_INTEGER_KIND = 4*/ #ifdef WITH_FAILED_IMAGES - void *srcIt, *dstIt; - void *mem = calloc(n_failed_imgs, local_kind); + void *mem = calloc (num_images_failed, local_kind); array->base_addr = mem; - dstIt = mem; - srcIt = failed_images_array; - for (int i = 0; i < n_failed_imgs; ++i) + for (int i = 0; i < caf_num_images; ++i) { - switch (local_kind) - { - case 1: - *(int8_t *)dstIt = *(int32_t *)scrIt; - break; - case 2: - *(int16_t *)dstIt = *(int32_t *)scrIt; - break; - case 4: - *(int32_t *)dstIt = *(int32_t *)scrIt; - break; - case 8: - *(int64_t *)dstIt = *(int32_t *)scrIt; - break; + if (image_stati[i] == STAT_FAILED_IMAGE) + { + switch (local_kind) + { + case 1: + *(int8_t *)mem = i + 1; + break; + case 2: + *(int16_t *)mem = i + 1; + break; + case 4: + *(int32_t *)mem = i + 1; + break; + case 8: + *(int64_t *)mem = i + 1; + break; #ifdef HAVE_GFC_INTEGER_16 - case 16: - *(int128t *)dstIt = *(int32_t *)scrIt; - break; + case 16: + *(int128t *)mem = i + 1; + break; #endif - default: - caf_runtime_error("Unsupported integer kind %1 in caf_failed_images.", local_kind); - } - dstIt += local_kind; - srcIt += sizeof(int); + default: + caf_runtime_error("Unsupported integer kind %1 in caf_failed_images.", local_kind); + } + mem += local_kind; + } } - qsort(mem, n_failed_imgs, local_kind, cmpfunc); - array->dim[0]._ubound = n_failed_imgs-1; + array->dim[0]._ubound = num_images_failed-1; #else - fputs("FAILED_IMAGES() support unavailable in this build.\n", stderr); + unsupported_fail_images_message ("FAILED_IMAGES()"); array->dim[0]._ubound = -1; array->base_addr = NULL; #endif @@ -4473,42 +4546,40 @@ PREFIX (stopped_images) (gfc_descriptor_t *array, int team __attribute__ ((unuse int local_kind = kind ? *kind : 4; /* GFC_DEFAULT_INTEGER_KIND = 4*/ #ifdef WITH_FAILED_IMAGES - void *srcIt, *dstIt; - void *mem = calloc(n_stopped_imgs, local_kind); + void *mem = calloc (num_images_stopped, local_kind); array->base_addr = mem; - dstIt = mem; - srcIt = stopped_images; - for (int i = 0; i < n_stopped_imgs; ++i) + for (int i = 0; i < caf_num_images; ++i) { - switch (local_kind) - { - case 1: - *(int8_t *)dstIt = *(int32_t *)scrIt; - break; - case 2: - *(int16_t *)dstIt = *(int32_t *)scrIt; - break; - case 4: - *(int32_t *)dstIt = *(int32_t *)scrIt; - break; - case 8: - *(int64_t *)dstIt = *(int32_t *)scrIt; - break; + if (image_stati[i]) + { + switch (local_kind) + { + case 1: + *(int8_t *)mem = i + 1; + break; + case 2: + *(int16_t *)mem = i + 1; + break; + case 4: + *(int32_t *)mem = i + 1; + break; + case 8: + *(int64_t *)mem = i + 1; + break; #ifdef HAVE_GFC_INTEGER_16 - case 16: - *(int128t *)dstIt = *(int32_t *)scrIt; - break; + case 16: + *(int128t *)mem = i + 1; + break; #endif - default: - caf_runtime_error("Unsupported integer kind %1 in caf_stopped_images.", local_kind); - } - dstIt += local_kind; - srcIt += sizeof(int); + default: + caf_runtime_error("Unsupported integer kind %1 in caf_stopped_images.", local_kind); + } + mem += local_kind; + } } - qsort(mem, n_stopped_imgs, local_kind, cmpfunc); - array->dim[0]._ubound = n_failed_imgs-1; + array->dim[0]._ubound = num_images_stopped - 1; #else - fputs("STOPPED_IMAGES() support unavailable in this build.\n", stderr); + unsupported_fail_images_message ("STOPPED_IMAGES()"); array->dim[0]._ubound = -1; array->base_addr = NULL; #endif @@ -4518,3 +4589,31 @@ PREFIX (stopped_images) (gfc_descriptor_t *array, int team __attribute__ ((unuse array->dim[0]._stride = 1; array->offset = 0; } + +/* Give a descriptive message when failed images support is not available. */ +void +unsupported_fail_images_message (const char * functionname) +{ + fprintf (stderr, "*** caf_mpi-lib runtime message on image %d:\n" + "*** The failed images feature '%s' of Fortran 2015 standard\n" + "*** is not available in this build. You need a compiler with failed images\n" + "*** support activated and compile OpenCoarrays with failed images support.\n", + caf_this_image, functionname); +#ifdef STOP_ON_UNSUPPORTED + exit (EXIT_FAILURE); +#endif +} + +/* Give a descriptive message when support for an allocatable components feature + * is not available. */ +void +unimplemented_alloc_comps_message (const char * functionname) +{ + fprintf (stderr, "*** caf_mpi-lib runtime message on image %d:\n" + "*** The allocatable components feature '%s' of Fortran 2008 standard\n" + "*** is not yet supported by OpenCoarrays.\n", + caf_this_image, functionname); +#ifdef STOP_ON_UNSUPPORTED + exit (EXIT_FAILURE); +#endif +} diff --git a/src/tests/unit/CMakeLists.txt b/src/tests/unit/CMakeLists.txt index 96bff9f29..9a6a6e305 100644 --- a/src/tests/unit/CMakeLists.txt +++ b/src/tests/unit/CMakeLists.txt @@ -4,7 +4,7 @@ if (${opencoarrays_aware_compiler}) add_subdirectory(init_register) add_subdirectory(collectives) add_subdirectory(sync) - add_subdirectory(image_states) + add_subdirectory(fail_images) else() add_subdirectory(extensions) endif() diff --git a/src/tests/unit/fail_images/CMakeLists.txt b/src/tests/unit/fail_images/CMakeLists.txt new file mode 100644 index 000000000..0bc5d0162 --- /dev/null +++ b/src/tests/unit/fail_images/CMakeLists.txt @@ -0,0 +1,27 @@ +add_executable(image_fail_test_1 image_fail_test_1.f90) +target_link_libraries(image_fail_test_1 OpenCoarrays) + +add_executable(image_status_test_1 image_status_test_1.f90) +target_link_libraries(image_status_test_1 OpenCoarrays) + +add_executable(image_fail_and_sync_test_1 image_fail_and_sync_test_1.f90) +target_link_libraries(image_fail_and_sync_test_1 OpenCoarrays) + +add_executable(image_fail_and_sync_test_2 image_fail_and_sync_test_2.f90) +target_link_libraries(image_fail_and_sync_test_2 OpenCoarrays) + +add_executable(image_fail_and_sync_test_3 image_fail_and_sync_test_3.f90) +target_link_libraries(image_fail_and_sync_test_3 OpenCoarrays) + +add_executable(image_fail_and_status_test_1 image_fail_and_status_test_1.f90) +target_link_libraries(image_fail_and_status_test_1 OpenCoarrays) + +add_executable(image_fail_and_get_test_1 image_fail_and_get_test_1.f90) +target_link_libraries(image_fail_and_get_test_1 OpenCoarrays) + +add_executable(image_fail_and_failed_images_test_1 image_fail_and_failed_images_test_1.f90) +target_link_libraries(image_fail_and_failed_images_test_1 OpenCoarrays) + +add_executable(image_fail_and_stopped_images_test_1 image_fail_and_stopped_images_test_1.f90) +target_link_libraries(image_fail_and_stopped_images_test_1 OpenCoarrays) + diff --git a/src/tests/unit/fail_images/image_fail_and_failed_images_test_1.f90 b/src/tests/unit/fail_images/image_fail_and_failed_images_test_1.f90 new file mode 100644 index 000000000..73093e83a --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_failed_images_test_1.f90 @@ -0,0 +1,31 @@ +! Check that after an image has failed the failed_images function returns the +! correct indices. +! Image two is to fail, all others to continue. + +program image_fail_and_failed_images_test_1 + use iso_fortran_env , only : STAT_FAILED_IMAGE + implicit none + integer :: i, stat + integer, allocatable :: fimages(:) + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + do i= 1, np + if (image_status(i) /= 0) error stop "image_status(i) should not fail" + end do + + ! Need a sync here to make sure all images are started and to prevent image fail + ! is not already detected in above image_status(i). + sync all + if (me == 2) fail image + sync all (STAT=stat) + + fimages = failed_images() + if (size(fimages) /= 1) error stop "failed_images()'s size should be one." + if (fimages(1) /= 2) error stop "The second image should have failed." + + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_failed_images_test_1 + diff --git a/src/tests/unit/fail_images/image_fail_and_get_test_1.f90 b/src/tests/unit/fail_images/image_fail_and_get_test_1.f90 new file mode 100644 index 000000000..8672f37be --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_get_test_1.f90 @@ -0,0 +1,32 @@ +! Check that after an image has failed a get to other images is still possible. +! Image two is to fail, all others to continue. + +program image_fail_and_get_test_1 + use iso_fortran_env , only : STAT_FAILED_IMAGE + implicit none + integer :: i, stat + integer, save :: share[*] + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + + share = 37 + + sync all + if (me == 2) fail image + sync all (STAT=stat) + + print *, "Checking shared value." + do i= 1, np + if (i /= 2 .AND. i /= me) then + if (share[i, STAT=stat] /= 37) error stop "Expected to get value from images alive." + print *, me, "Stat of #", i, " is:", stat + end if + end do + + sync all(STAT=stat) + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_get_test_1 + diff --git a/src/tests/unit/fail_images/image_fail_and_status_test_1.f90 b/src/tests/unit/fail_images/image_fail_and_status_test_1.f90 new file mode 100644 index 000000000..917b5a919 --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_status_test_1.f90 @@ -0,0 +1,33 @@ +! Check that the status of a failed image is retrieved correctly. +! Image two is to fail, all others to continue. + +program image_fail_and_status_test_1 + use iso_fortran_env , only : STAT_FAILED_IMAGE, STAT_STOPPED_IMAGE + implicit none + integer :: i, stat + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + do i= 1, np + if (image_status(i) /= 0) error stop "image_status(i) should not fail" + end do + + ! Need to sync here or above image_status might catch a fail already. + sync all + if (me == 2) fail image + sync all (STAT=stat) + ! Check that all images returning from the sync report the failure of an image + print *,"sync all (STAT=", stat, ")" + if (stat /= STAT_FAILED_IMAGE) error stop "Expected sync all (STAT == STAT_FAILED_IMAGE)." + + do i= 1, np + stat = image_status(i) + if (i /= 2 .AND. stat /= 0 .AND. stat /= STAT_STOPPED_IMAGE) error stop "image_status(i) should not fail" + if (i == 2 .AND. stat /= STAT_FAILED_IMAGE) error stop "image_status(2) should report fail" + end do + + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_status_test_1 + diff --git a/src/tests/unit/fail_images/image_fail_and_stopped_images_test_1.f90 b/src/tests/unit/fail_images/image_fail_and_stopped_images_test_1.f90 new file mode 100644 index 000000000..3980d618e --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_stopped_images_test_1.f90 @@ -0,0 +1,30 @@ +! Check that letting images exit the stopped_images function returns the +! correct indices. +! Image two is to stop, all others to continue. + +program image_fail_and_stopped_images_test_1 + implicit none + integer :: i, stat + integer, allocatable :: simages(:) + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + + ! Need a sync here to make sure all images are started and to prevent image fail + ! is not already detected in above image_status(i). + sync all + if (me == 2) stop 0 + sync all (STAT=stat) + + simages = stopped_images() + if (size(simages) /= 1) error stop "stopped_images()'s size should be one." + if (simages(1) /= 2) then + print *, me, "stopped image: ", simages(1) + error stop "The second image should have stopped." + end if + + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_stopped_images_test_1 + diff --git a/src/tests/unit/fail_images/image_fail_and_sync_test_1.f90 b/src/tests/unit/fail_images/image_fail_and_sync_test_1.f90 new file mode 100644 index 000000000..fb80d8646 --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_sync_test_1.f90 @@ -0,0 +1,27 @@ +! Check that after an image has failed a sync all ends correctly. +! Image two is to fail, all others to continue. + +program image_fail_and_sync_test_1 + use iso_fortran_env , only : STAT_FAILED_IMAGE + implicit none + integer :: i, syncAllStat + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + do i= 1, np + if (image_status(i) /= 0) error stop "image_status(i) should not fail" + end do + + ! Need a sync here to make sure all images are started and to prevent image fail + ! is not already detected in above image_status(i). + sync all + if (me == 2) fail image + sync all (STAT=syncAllStat) + ! Check that all images returning from the sync report the failure of an image + if (syncAllStat /= STAT_FAILED_IMAGE) error stop "Expected sync all (STAT == STAT_FAILED_IMAGE)." + + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_sync_test_1 + diff --git a/src/tests/unit/fail_images/image_fail_and_sync_test_2.f90 b/src/tests/unit/fail_images/image_fail_and_sync_test_2.f90 new file mode 100644 index 000000000..44dc93247 --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_sync_test_2.f90 @@ -0,0 +1,25 @@ +! Check that after an image has failed a sync images ends correctly. +! Image two is to fail, all others to continue. + +program image_fail_and_sync_test_2 + use iso_fortran_env , only : STAT_FAILED_IMAGE + implicit none + integer :: i, syncAllStat + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + do i= 1, np + if (image_status(i) /= 0) error stop "image_status(i) should not fail" + end do + + sync all + if (me == 2) fail image + sync images (*, STAT=syncAllStat) + ! Check that all images returning from the sync report the failure of an image + if (syncAllStat /= STAT_FAILED_IMAGE) error stop "Expected sync all (STAT == STAT_FAILED_IMAGE)." + + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_sync_test_2 + diff --git a/src/tests/unit/fail_images/image_fail_and_sync_test_3.f90 b/src/tests/unit/fail_images/image_fail_and_sync_test_3.f90 new file mode 100644 index 000000000..2cc92dd96 --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_and_sync_test_3.f90 @@ -0,0 +1,24 @@ +! Check that after an image has failed a sync images ends correctly. +! Image two is to fail, all others to continue. + +program image_fail_and_sync_test_3 + use iso_fortran_env , only : STAT_FAILED_IMAGE + implicit none + integer :: i, syncAllStat + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + do i= 1, np + if (image_status(i) /= 0) error stop "image_status(i) should not fail" + end do + + sync all + if (me == 2) fail image + sync all(STAT=syncAllStat) + if (image_status(2) /= STAT_FAILED_IMAGE) error stop "Expected STAT_FAILED_IMAGE for image 2." + + if (me == 1) print *,"Test passed." + end associate + +end program image_fail_and_sync_test_3 + diff --git a/src/tests/unit/fail_images/image_fail_test_1.f90 b/src/tests/unit/fail_images/image_fail_test_1.f90 new file mode 100644 index 000000000..544ea0c8d --- /dev/null +++ b/src/tests/unit/fail_images/image_fail_test_1.f90 @@ -0,0 +1,21 @@ +! Check that failing an image works. +! Image two is to fail, all others to continue. + +program image_fail_test_1 + use iso_fortran_env , only : STAT_FAILED_IMAGE + implicit none + integer :: i, syncAllStat + + associate(np => num_images(), me => this_image()) + if (np < 3) error stop "I need at least 3 images to function." + + if (me == 2) fail image + + if (me == 2) print *, "Test failed." + + sync all (STAT=syncAllStat) + if (me == 1) print *, "Test passed." + end associate + +end program image_fail_test_1 + diff --git a/src/tests/unit/image_states/image_status_test_1.f90 b/src/tests/unit/fail_images/image_status_test_1.f90 similarity index 82% rename from src/tests/unit/image_states/image_status_test_1.f90 rename to src/tests/unit/fail_images/image_status_test_1.f90 index 13f8cbb7e..e6cf521d4 100644 --- a/src/tests/unit/image_states/image_status_test_1.f90 +++ b/src/tests/unit/fail_images/image_status_test_1.f90 @@ -1,3 +1,5 @@ +! Check the status of all images. Error only, when one unexpectedly failed. + program test_image_status_1 use iso_fortran_env , only : STAT_STOPPED_IMAGE implicit none diff --git a/src/tests/unit/image_states/CMakeLists.txt b/src/tests/unit/image_states/CMakeLists.txt deleted file mode 100644 index cd3ef86ca..000000000 --- a/src/tests/unit/image_states/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_executable(image_status_test_1 image_status_test_1.f90) -target_link_libraries(image_status_test_1 OpenCoarrays) - From d45aa2c6510bf9c4d2a2386541fb7b52c297e09f Mon Sep 17 00:00:00 2001 From: Damian Rouson Date: Tue, 23 May 2017 21:17:04 +0300 Subject: [PATCH 55/61] Add comment to syncimages_status test --- src/tests/unit/sync/syncimages_status.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/unit/sync/syncimages_status.f90 b/src/tests/unit/sync/syncimages_status.f90 index a90e6d942..36f302cf4 100644 --- a/src/tests/unit/sync/syncimages_status.f90 +++ b/src/tests/unit/sync/syncimages_status.f90 @@ -18,4 +18,5 @@ program sync_images_stat if(me == 2) print *, 'Test passed.' end if + ! Image 1 implicitly synchronizes as part of normal termination end program sync_images_stat From 0eb8345796a1d7ea6a344cac36f04b99af8bcd9a Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Tue, 23 May 2017 17:39:22 -0400 Subject: [PATCH 56/61] Update build system for failed images - Ensure that tests requiring GCC 7 are only built when it is available - Add a default option (set to ON) to enable failed images support when the required experimental/proposed fault tolerant MPI features are detected. --- CMakeLists.txt | 34 +++++++++++----------- src/mpi/CMakeLists.txt | 54 +++++++++++++++++++++++++++++++++++ src/mpi/mpi_caf.c | 2 +- src/tests/unit/CMakeLists.txt | 4 ++- 4 files changed, 76 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e267b0e0..8cfd1f212 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -465,7 +465,7 @@ function(add_mpi_test name num_mpi_proc path) set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.") endfunction(add_mpi_test) -function(add_mpi_failable_test name num_mpi_proc path) +function(add_fault_tolerant_mpi_test name num_mpi_proc path) if ( ((N LESS num_mpi_proc) OR (N EQUAL 0)) ) message(STATUS "Test ${name} is oversubscribed: ${num_mpi_proc} ranks requested with ${N} system processor available.") if ( openmpi ) @@ -481,7 +481,7 @@ function(add_mpi_failable_test name num_mpi_proc path) set(test_parameters ${test_parameters} ${MPIEXEC_NUMPROC_FLAG} ${num_mpi_proc} -disable-auto-cleanup ) add_test(NAME ${name} COMMAND ${MPIEXEC} ${test_parameters} "${path}") set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.") -endfunction(add_mpi_failable_test) +endfunction(add_fault_tolerant_mpi_test) set(tests_root ${CMAKE_CURRENT_BINARY_DIR}/src/tests) @@ -551,20 +551,22 @@ if(opencoarrays_aware_compiler) add_mpi_test(co_reduce_string 4 ${tests_root}/unit/collectives/co_reduce_string) # IMAGE FAIL tests - add_mpi_test(image_status_test_1 4 ${tests_root}/unit/fail_images/image_status_test_1) -#ifdef WITH_FAIL_IMAGES -# No other way to check that image_fail_test_1 passes. - add_mpi_failable_test(image_fail_test_1 4 ${tests_root}/unit/fail_images/image_fail_test_1) - set_property(TEST image_fail_test_1 PROPERTY FAIL_REGULAR_EXPRESSION "Test failed") - set_property(TEST image_fail_test_1 PROPERTY PASS_REGULAR_EXPRESSION "Test passed") - add_mpi_failable_test(image_fail_and_sync_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_1) - add_mpi_failable_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2) - add_mpi_failable_test(image_fail_and_sync_test_3 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_3) - add_mpi_failable_test(image_fail_and_status_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_status_test_1) - add_mpi_failable_test(image_fail_and_failed_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_failed_images_test_1) - add_mpi_failable_test(image_fail_and_stopped_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_stopped_images_test_1) - add_mpi_failable_test(image_fail_and_get_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_get_test_1) -#endif + if(NOT CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 7) + add_mpi_test(image_status_test_1 4 ${tests_root}/unit/fail_images/image_status_test_1) + if(CAF_ENABLE_FAILED_IMAGES) + # No other way to check that image_fail_test_1 passes. + add_fault_tolerant_mpi_test(image_fail_test_1 4 ${tests_root}/unit/fail_images/image_fail_test_1) + set_property(TEST image_fail_test_1 PROPERTY FAIL_REGULAR_EXPRESSION "Test failed") + set_property(TEST image_fail_test_1 PROPERTY PASS_REGULAR_EXPRESSION "Test passed") + add_fault_tolerant_mpi_test(image_fail_and_sync_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_1) + add_fault_tolerant_mpi_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2) + add_fault_tolerant_mpi_test(image_fail_and_sync_test_3 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_3) + add_fault_tolerant_mpi_test(image_fail_and_status_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_status_test_1) + add_fault_tolerant_mpi_test(image_fail_and_failed_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_failed_images_test_1) + add_fault_tolerant_mpi_test(image_fail_and_stopped_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_stopped_images_test_1) + add_fault_tolerant_mpi_test(image_fail_and_get_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_get_test_1) + endif() + endif() else() add_test(co_sum_extension ${tests_root}/unit/extensions/test-co_sum-extension.sh) set_property(TEST co_sum_extension PROPERTY PASS_REGULAR_EXPRESSION "Test passed.") diff --git a/src/mpi/CMakeLists.txt b/src/mpi/CMakeLists.txt index 26e65f1b9..101f5b774 100644 --- a/src/mpi/CMakeLists.txt +++ b/src/mpi/CMakeLists.txt @@ -28,6 +28,60 @@ if(CAF_EXPOSE_INIT_FINALIZE) add_definitions(-DEXPOSE_INIT_FINALIZE) endif() +#---------------------------------------------------------------------- +# Test if MPI implementation provides features needed for failed images +#---------------------------------------------------------------------- +set(NEEDED_SYMBOLS MPIX_ERR_PROC_FAILED;MPIX_ERR_REVOKED;MPIX_Comm_failure_ack;MPIX_Comm_failure_get_acked;MPIX_Comm_shrink;MPIX_Comm_agree) +set(MPI_HAS_FAULT_TOL_EXT YES) +set(old_cmake_required_includes "${CMAKE_REQUIRED_INCLUDES}") +if(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES};${MPI_C_INCLUDE_PATH}) +else() + set(CMAKE_REQUIRED_INCLUDES ${MPI_C_INCLUDE_PATH}) +endif() +set(old_cmake_required_flags "${CMAKE_REQUIRED_FLAGS}") +if(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS};${MPI_C_COMPILE_FLAGS};${MPI_C_LINK_FLAGS}) +else() + set(CMAKE_REQUIRED_FLAGS ${MPI_C_COMPILE_FLAGS};${MPI_C_LINK_FLAGS}) +endif() +set(old_cmake_required_libraries "${CMAKE_REQUIRED_LIBRARIES}") +if(CMAKE_REQUIRED_LIBRARIES) + set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES};${MPI_C_LIBRARIES}) +else() + set(CMAKE_REQUIRED_LIBRARIES ${MPI_C_LIBRARIES}) +endif() + +set(MPI_HEADERS mpi.h) +CHECK_INCLUDE_FILE("mpi-ext.h" HAVE_MPI_EXT) +if(HAVE_MPI_EXT) + add_definitions(-DHAVE_MPI_EXT_H) + set(MPI_HEADERS ${MPI_HEADERS};mpi-ext.h) +endif() +include(CheckSymbolExists) +foreach(symbol ${NEEDED_SYMBOLS}) + CHECK_SYMBOL_EXISTS(${symbol} ${MPI_HEADERS} HAVE_${symbol}) + if(NOT HAVE_${symbol}) + message( STATUS "\${HAVE_${symbol}} = ${HAVE_${symbol}}") + message( WARNING "Disabling Failed Image support due to lack of support in the current MPI implementation.") + set(MPI_HAS_FAULT_TOL_EXT NO) + break() # no need to keep looking + endif() +endforeach(symbol) +set(CMAKE_REQUIRED_INCLUDES ${old_cmake_required_includes}) +set(CMAKE_REQUIRED_FLAGS ${old_cmake_required_flags}) +set(CMAKE_REQUIRED_LIBRARIES ${old_cmake_required_libraries}) + +if(MPI_HAS_FAULT_TOL_EXT) + option(CAF_ENABLE_FAILED_IMAGES "Enable failed images support" TRUE) +else() + set(CAF_ENABLE_FAILED_IMAGES FALSE CACHE BOOL "Enable failed images support" FORCE) +endif() + +if(CAF_ENABLE_FAILED_IMAGES) + add_definitions(-DUSE_FAILED_IMAGES) +endif() + # Determine whether and how to include OpenCoarrays module based on if the Fortran MPI compiler: # - workds # - is compatible with the fortran compiler used to build the MPI implementation diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index ebf807e2d..83ff5ce17 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -43,7 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include /* For raise */ -#ifdef MPI_NEEDS_MPI_EXT +#ifdef HAVE_MPI_EXT_H #include #endif #ifdef USE_FAILED_IMAGES diff --git a/src/tests/unit/CMakeLists.txt b/src/tests/unit/CMakeLists.txt index 9a6a6e305..015ce1172 100644 --- a/src/tests/unit/CMakeLists.txt +++ b/src/tests/unit/CMakeLists.txt @@ -4,7 +4,9 @@ if (${opencoarrays_aware_compiler}) add_subdirectory(init_register) add_subdirectory(collectives) add_subdirectory(sync) - add_subdirectory(fail_images) + if(NOT CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 7) + add_subdirectory(fail_images) + endif() else() add_subdirectory(extensions) endif() From dd8442e53868d722bff0b099991776a3da804ab1 Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Tue, 23 May 2017 17:45:36 -0400 Subject: [PATCH 57/61] Rename CMake OpenCoarrays developer tests option This will create better grouping in `ccmake` and `cmake-gui` --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cfd1f212..fa02a5d76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,11 +8,11 @@ set_property ( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYP # Add option and check environment to determine if developer tests should be run if($ENV{OPENCOARRAYS_DEVELOPER}) - option(RUN_DEVELOPER_TESTS "Run tests intended only for developers" ON) + option(CAF_RUN_DEVELOPER_TESTS "Run tests intended only for developers" ON) else() - option(RUN_DEVELOPER_TESTS "Run tests intended only for developers" OFF) + option(CAF_RUN_DEVELOPER_TESTS "Run tests intended only for developers" OFF) endif() -mark_as_advanced(RUN_DEVELOPER_TESTS) +mark_as_advanced(CAF_RUN_DEVELOPER_TESTS) if( NOT DEFINED ENV{OPENCOARRAYS_DEVELOPER}) set ( ENV{OPENCOARRAYS_DEVELOPER} FALSE ) @@ -498,7 +498,7 @@ if(opencoarrays_aware_compiler) add_mpi_test(register_alloc_comp_1 2 ${tests_root}/unit/init_register/register_alloc_comp_1) add_mpi_test(register_alloc_comp_2 2 ${tests_root}/unit/init_register/register_alloc_comp_2) add_mpi_test(register_alloc_comp_3 2 ${tests_root}/unit/init_register/register_alloc_comp_3) - if (RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER}) + if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER}) message ( STATUS "Running Developer tests is enabled." ) add_mpi_test(async_comp_alloc 6 ${tests_root}/unit/init_register/async_comp_alloc) # Timeout async_comp_alloc test after 3 seconds to progess past the known failure @@ -541,7 +541,7 @@ if(opencoarrays_aware_compiler) # GFortran PR 78505 only fixed on trunk/gcc 7 add_mpi_test(source-alloc-no-sync 8 ${tests_root}/regression/reported/source-alloc-sync) endif() - if (RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER}) + if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER}) add_mpi_test(convert-before-put 3 ${tests_root}/regression/reported/convert-before-put) endif() add_mpi_test(event-post 3 ${tests_root}/regression/reported/event-post) From 151eb80597d47475bf290528957c02f39b6bc40a Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Tue, 23 May 2017 17:46:39 -0400 Subject: [PATCH 58/61] Handle `` better - Test for existence, assume symbols/functionality provided elsewhere if it is missing. Should induce compile time error if this is not true. --- src/mpi/CMakeLists.txt | 7 +++++++ src/mpi/mpi_caf.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/mpi/CMakeLists.txt b/src/mpi/CMakeLists.txt index 101f5b774..57e71709e 100644 --- a/src/mpi/CMakeLists.txt +++ b/src/mpi/CMakeLists.txt @@ -28,6 +28,13 @@ if(CAF_EXPOSE_INIT_FINALIZE) add_definitions(-DEXPOSE_INIT_FINALIZE) endif() +include(CheckIncludeFile) +CHECK_INCLUDE_FILE("alloca.h" HAVE_ALLOCA) +if(NOT HAVE_ALLOCA) + add_definitions(-DALLOCA_MISSING) + message(WARNING "Could not find . Assuming functionality is provided elsewhere.") +endif() + #---------------------------------------------------------------------- # Test if MPI implementation provides features needed for failed images #---------------------------------------------------------------------- diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 83ff5ce17..21a23c499 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -37,7 +37,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include /* For memcpy. */ #include /* For variadic arguments. */ -#include +#ifndef ALLOCA_MISSING +#include /* Assume functionality provided elsewhere if missing */ +#endif #include #include #include From 59b0165d82871d9003b8a878c449dea74d0a3a28 Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Tue, 23 May 2017 18:59:02 -0400 Subject: [PATCH 59/61] Ensure that libcaf.h is early in includes May fix #388 (issue with Clang 4 on FreeBSD) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa02a5d76..3201a389c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -387,7 +387,7 @@ include(GNUInstallDirs) #------------------------------- # Recurse into the src directory #------------------------------- -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src) add_subdirectory(src) From 9c6eb822a0906bba018a0fdb607faa8eefcf7507 Mon Sep 17 00:00:00 2001 From: Izaak Beekman Date: Wed, 24 May 2017 19:41:58 -0400 Subject: [PATCH 60/61] Flag image_fail_and_sync_test_2 as developer - Intermittent failure on Travis-CI when run repeatedly `ctest --output-on-failure --repeat-until-fail 10` - Bug report: https://github.com/sourceryinstitute/OpenCoarrays/issues/390 --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3201a389c..479baaeed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -559,7 +559,9 @@ if(opencoarrays_aware_compiler) set_property(TEST image_fail_test_1 PROPERTY FAIL_REGULAR_EXPRESSION "Test failed") set_property(TEST image_fail_test_1 PROPERTY PASS_REGULAR_EXPRESSION "Test passed") add_fault_tolerant_mpi_test(image_fail_and_sync_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_1) - add_fault_tolerant_mpi_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2) + if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER}) + add_fault_tolerant_mpi_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2) + endif() add_fault_tolerant_mpi_test(image_fail_and_sync_test_3 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_3) add_fault_tolerant_mpi_test(image_fail_and_status_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_status_test_1) add_fault_tolerant_mpi_test(image_fail_and_failed_images_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_failed_images_test_1) From 120210ae3d21f31c8102458ca8b21064124d4be5 Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Thu, 25 May 2017 13:44:29 +0200 Subject: [PATCH 61/61] Do not shrink communicator when number of failed images is out of bounds. That the number of images failed is reported incorrectly, may happen when in the finalization stage. To prevent this, the mpi-error handler now checks for the correct number and on incorrect ones just exists ok. This commit should Fix #390. --- src/mpi/mpi_caf.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index ebf807e2d..c21666b86 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -394,6 +394,11 @@ failed_stopped_errorhandler_function (MPI_Comm* pcomm, int* perr, ...) *perr = MPI_SUCCESS; return; } + if (num_failed_in_group > caf_num_images) + { + *perr = MPI_SUCCESS; + return; + } MPI_Comm_group (comm, &comm_world_group); ranks_of_failed_in_comm_world = (int *) alloca (sizeof (int) @@ -451,8 +456,20 @@ failed_stopped_errorhandler_function (MPI_Comm* pcomm, int* perr, ...) /* TODO: Consider whether removing the failed image from images_full will be * necessary. This is more or less politics. */ for (i = 0; i < num_failed_in_group; ++i) - if (image_stati[ranks_of_failed_in_comm_world[i]] == 0) - image_stati[ranks_of_failed_in_comm_world[i]] = STAT_FAILED_IMAGE; + { + if (ranks_of_failed_in_comm_world[i] >= 0 + && ranks_of_failed_in_comm_world[i] < caf_num_images) + { + if (image_stati[ranks_of_failed_in_comm_world[i]] == 0) + image_stati[ranks_of_failed_in_comm_world[i]] = STAT_FAILED_IMAGE; + } + else + { + dprint ("%d/%d: Rank of failed image %d out of range of images 0..%d.\n", + caf_this_image, caf_num_images, ranks_of_failed_in_comm_world[i], + caf_num_images); + } + } redo: dprint ("%d/%d: %s: Before shrink. \n", caf_this_image, caf_num_images, __FUNCTION__);