Skip to content

Commit 227b8c1

Browse files
committed
ofi: Delay monitor initialization
Delay initializing the import memory monitor from component open to a function which should be called immediately before the Libfabric domain is initialized. Registering the import memory monitor requires initializing the patcher memory hooks, which is not necessary if the OFI fabrics were not selected. Signed-off-by: Brian Barrett <[email protected]>
1 parent a5b3ebf commit 227b8c1

File tree

4 files changed

+152
-59
lines changed

4 files changed

+152
-59
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
887887
}
888888
}
889889

890+
/* this must be called during single threaded part of the code and
891+
* before Libfabric configures its memory monitors. Easiest to do
892+
* that before domain open. Silently ignore not-supported errors,
893+
* as they are not critical to program correctness, but only
894+
* indicate that LIbfabric will have to pick a different, possibly
895+
* less optimial, monitor. */
896+
ret = opal_common_ofi_inject_memory_monitor();
897+
if (0 != ret && -FI_ENOSYS != ret) {
898+
opal_output_verbose(1, opal_common_ofi.output,
899+
"Failed to inject Libfabric memory monitor: %s",
900+
fi_strerror(-ret));
901+
}
902+
903+
890904
/**
891905
* Open fabric
892906
* The getinfo struct returns a fabric attribute struct that can be used to

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,19 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
446446
* to prevent races. */
447447
mca_btl_ofi_rcache_init(module);
448448

449+
/* for similar reasons to the rcache call, this must be called
450+
* during single threaded part of the code and before Libfabric
451+
* configures its memory monitors. Easiest to do that before
452+
* domain open. Silently ignore not-supported errors, as they
453+
* are not critical to program correctness, but only indicate
454+
* that LIbfabric will have to pick a different, possibly less
455+
* optimial, monitor. */
456+
rc = opal_common_ofi_inject_memory_monitor();
457+
if (0 != rc && -FI_ENOSYS != rc) {
458+
BTL_VERBOSE(("Failed to inject Libfabric memory monitor: %s",
459+
fi_strerror(-rc)));
460+
}
461+
449462
linux_device_name = info->domain_attr->name;
450463
BTL_VERBOSE(
451464
("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name));

opal/mca/common/ofi/common_ofi.c

Lines changed: 113 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,26 @@
3636
#include "opal/util/show_help.h"
3737

3838
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
39-
.prov_exclude = NULL,
40-
.registered = 0,
41-
.verbose = 0};
42-
39+
.prov_exclude = NULL,
40+
.output = -1};
4341
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic";
4442
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
45-
static bool opal_common_ofi_initialized = false;
4643
static int opal_common_ofi_init_ref_cnt = 0;
44+
static bool opal_common_ofi_installed_memory_monitor = false;
4745

4846
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
4947

5048
/*
51-
* These no-op functions are necessary since libfabric does not allow null
52-
* function pointers here.
49+
* Monitor object to inject into Libfabric to provide memory release
50+
* notifications using our own memory hooks framework. Monitors may
51+
* use the subscribe/unsubscribe notifications to reduce unnecessary
52+
* notifications, but are not required to do so. Because patcher
53+
* notifies about all releases, it is cheaper for us to not filter and
54+
* this monitor can safely ignore subscribe/unsubscribe notifications.
55+
*
56+
* Libfabric requires the object to be fully defined. Unlike most of
57+
* Open MPI, it does not have NULL function pointer checks in calling
58+
* code.
5359
*/
5460
static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor)
5561
{
@@ -80,7 +86,7 @@ static bool opal_common_ofi_monitor_valid(struct fid_mem_monitor *monitor,
8086
}
8187

8288
static struct fid_mem_monitor *opal_common_ofi_monitor;
83-
static struct fid *opal_common_ofi_cache_fid;
89+
static struct fid *opal_common_ofi_cache_fid = NULL;
8490
static struct fi_ops_mem_monitor opal_common_ofi_export_ops = {
8591
.size = sizeof(struct fi_ops_mem_monitor),
8692
.start = opal_common_ofi_monitor_start,
@@ -90,6 +96,12 @@ static struct fi_ops_mem_monitor opal_common_ofi_export_ops = {
9096
.valid = opal_common_ofi_monitor_valid,
9197
};
9298

99+
/**
100+
* Callback function from Open MPI memory monitor
101+
*
102+
* Translation function between the callback function from Open MPI's
103+
* memory notifier to the Libfabric memory monitor.
104+
*/
93105
static void opal_common_ofi_mem_release_cb(void *buf, size_t length,
94106
void *cbdata, bool from_alloc)
95107
{
@@ -99,87 +111,131 @@ static void opal_common_ofi_mem_release_cb(void *buf, size_t length,
99111

100112
#endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */
101113

102-
int opal_common_ofi_open(void)
114+
int opal_common_ofi_inject_memory_monitor(void)
103115
{
104-
int ret;
116+
int ret = -FI_ENOSYS;
105117

106-
opal_common_ofi_init_ref_cnt++;
107-
if (opal_common_ofi_initialized) {
108-
return OPAL_SUCCESS;
109-
}
110118
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
119+
uint32_t version;
111120

112-
mca_base_framework_open(&opal_memory_base_framework, 0);
121+
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
122+
123+
if (opal_common_ofi_cache_fid != NULL) {
124+
return 0;
125+
}
126+
127+
/*
128+
* While the memory import functionality was introduced in 1.13,
129+
* some deadlock bugs exist in the 1.13 series. Require version
130+
* 1.14 before this code is activated. Not activating the code
131+
* should not break any functionality directly, but may lead to
132+
* sub-optimal memory monitors being used in Libfabric, as Open
133+
* MPI will almost certainly install a patcher first.
134+
*/
135+
version = fi_version();
136+
if (FI_VERSION_LT(version, FI_VERSION(1, 14))) {
137+
ret = -FI_ENOSYS;
138+
goto err;
139+
}
140+
141+
ret = mca_base_framework_open(&opal_memory_base_framework, 0);
142+
if (OPAL_SUCCESS != ret) {
143+
ret = -FI_ENOSYS;
144+
goto err;
145+
}
113146
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)
114147
!= (((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT))
115148
& opal_mem_hooks_support_level())) {
116-
return OPAL_SUCCESS;
149+
ret = -FI_ENOSYS;
150+
goto err;
117151
}
118152

119153
/*
120-
* This cache object doesn't do much, but is necessary for the API to work.
121-
* It is required to call the fi_import_fid API. This API was introduced in
122-
* libfabric version 1.13.0 and "mr_cache" is a "well known" name (documented
123-
* in libfabric) to indicate the type of object that we are trying to open.
154+
* The monitor inject object has the well known name "mr_cache"
155+
* and was introduced in Libfabric 1.13
124156
*/
125-
ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, &opal_common_ofi_cache_fid, NULL);
126-
if (ret) {
157+
ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0,
158+
&opal_common_ofi_cache_fid, NULL);
159+
if (0 != ret) {
127160
goto err;
128161
}
129162

130163
opal_common_ofi_monitor = calloc(1, sizeof(*opal_common_ofi_monitor));
131-
if (!opal_common_ofi_monitor) {
164+
if (NULL == opal_common_ofi_monitor) {
165+
ret = -FI_ENOMEM;
132166
goto err;
133167
}
134168

135169
opal_common_ofi_monitor->fid.fclass = FI_CLASS_MEM_MONITOR;
136170
opal_common_ofi_monitor->export_ops = &opal_common_ofi_export_ops;
137-
/*
138-
* This import_fid call must occur before the libfabric provider creates
139-
* its memory registration cache. This will typically occur during domain
140-
* open as it is a domain level object. We put it early in initialization
141-
* to guarantee this and share the import monitor between the ofi btl
142-
* and ofi mtl.
143-
*/
144-
ret = fi_import_fid(opal_common_ofi_cache_fid, &opal_common_ofi_monitor->fid, 0);
145-
if (ret) {
171+
ret = fi_import_fid(opal_common_ofi_cache_fid,
172+
&opal_common_ofi_monitor->fid, 0);
173+
if (0 != ret) {
146174
goto err;
147175
}
148176
opal_mem_hooks_register_release(opal_common_ofi_mem_release_cb, NULL);
149-
opal_common_ofi_initialized = true;
177+
opal_common_ofi_installed_memory_monitor = true;
178+
179+
ret = 0;
150180

151-
return OPAL_SUCCESS;
152181
err:
153-
if (opal_common_ofi_cache_fid) {
154-
fi_close(opal_common_ofi_cache_fid);
155-
}
156-
if (opal_common_ofi_monitor) {
157-
free(opal_common_ofi_monitor);
182+
if (0 != ret) {
183+
if (opal_common_ofi_cache_fid) {
184+
fi_close(opal_common_ofi_cache_fid);
185+
}
186+
if (opal_common_ofi_monitor) {
187+
free(opal_common_ofi_monitor);
188+
}
158189
}
159190

160-
return OPAL_ERROR;
161-
#else
162-
opal_common_ofi_initialized = true;
163-
return OPAL_SUCCESS;
191+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
164192
#endif
193+
194+
return ret;
165195
}
166196

167-
int opal_common_ofi_close(void)
197+
static int opal_common_ofi_remove_memory_monitor(void)
168198
{
169-
if (opal_common_ofi_initialized && !--opal_common_ofi_init_ref_cnt) {
170199
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
200+
if (opal_common_ofi_installed_memory_monitor) {
171201
opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb);
172202
fi_close(opal_common_ofi_cache_fid);
173203
fi_close(&opal_common_ofi_monitor->fid);
174204
free(opal_common_ofi_monitor);
205+
opal_common_ofi_installed_memory_monitor = false;
206+
}
175207
#endif
176-
opal_common_ofi_initialized = false;
208+
209+
return OPAL_SUCCESS;
210+
}
211+
212+
int opal_common_ofi_open(void)
213+
{
214+
if ((opal_common_ofi_init_ref_cnt++) > 0) {
215+
return OPAL_SUCCESS;
177216
}
178217

179-
opal_common_ofi.registered--;
180-
assert(opal_common_ofi.registered >= 0);
181-
if (opal_common_ofi.registered == 0) {
182-
opal_output_close(opal_common_ofi.output);
218+
return OPAL_SUCCESS;
219+
}
220+
221+
int opal_common_ofi_close(void)
222+
{
223+
int ret;
224+
225+
if ((--opal_common_ofi_init_ref_cnt) > 0) {
226+
return OPAL_SUCCESS;
227+
}
228+
229+
ret = opal_common_ofi_remove_memory_monitor();
230+
if (ret != OPAL_SUCCESS) {
231+
return ret;
232+
}
233+
234+
if (opal_common_ofi.output != -1) {
235+
opal_output_close(opal_common_ofi.output);
236+
if (ret != OPAL_SUCCESS) {
237+
return ret;
238+
}
183239
}
184240

185241
return OPAL_SUCCESS;
@@ -209,18 +265,13 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
209265
static int include_index;
210266
static int exclude_index;
211267
static int verbose_index;
268+
int verbose;
212269
int param;
213270

214271
if (fi_version() < FI_VERSION(1, 0)) {
215272
return OPAL_ERROR;
216273
}
217274

218-
opal_common_ofi.registered++;
219-
if (opal_common_ofi.registered == 1) {
220-
opal_common_ofi.output = opal_output_open(NULL);
221-
opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose);
222-
}
223-
224275
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
225276

226277
param = mca_base_var_find("opal", "opal_common", "ofi", "provider_include");
@@ -272,7 +323,7 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
272323
MCA_BASE_VAR_TYPE_INT, NULL, 0,
273324
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
274325
MCA_BASE_VAR_SCOPE_LOCAL,
275-
&opal_common_ofi.verbose);
326+
&verbose);
276327
} else {
277328
verbose_index = param;
278329
}
@@ -289,6 +340,11 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
289340
"verbose", 0);
290341
}
291342

343+
if (opal_common_ofi.output == -1) {
344+
opal_common_ofi.output = opal_output_open(NULL);
345+
opal_output_set_verbosity(opal_common_ofi.output, verbose);
346+
}
347+
292348
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
293349

294350
return OPAL_SUCCESS;

opal/mca/common/ofi/common_ofi.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ BEGIN_C_DECLS
2626
typedef struct opal_common_ofi_module {
2727
char **prov_include;
2828
char **prov_exclude;
29-
int verbose;
30-
int registered;
3129
int output;
3230
} opal_common_ofi_module_t;
3331

@@ -69,6 +67,18 @@ OPAL_DECLSPEC int opal_common_ofi_open(void);
6967
*/
7068
OPAL_DECLSPEC int opal_common_ofi_close(void);
7169

70+
/**
71+
* Inject our memory hooks into Libfabric monitor
72+
*
73+
* Use Open MPI's memory hooks to provide monitor notifications to
74+
* Libfabric via the external mr_cache facility. This must be called
75+
* before any domain is initialized (ie, before any Libfabric memory
76+
* monitor is configured).
77+
*
78+
* @returns A libfabric error code is returned on error
79+
*/
80+
OPAL_DECLSPEC int opal_common_ofi_inject_memory_monitor(void);
81+
7282
/**
7383
* Search function for provider names
7484
*

0 commit comments

Comments
 (0)