From 8da460558997c42f5c32ee14bd24c4a33e52c40e Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 12 Dec 2018 14:53:11 +0900 Subject: [PATCH] btl/openib: immediately release the device when no port is allowed Many thanks to Sergey Oblomov for reporting this issue and the countless traces provided when troubleshooting it. This is a one-off commit for the v4.0.x branch since btl/openib has been removed from master. Refs. open-mpi/ompi#6137 Signed-off-by: Gilles Gouaillardet --- opal/mca/btl/openib/btl_openib.c | 10 +++++----- opal/mca/btl/openib/btl_openib.h | 2 ++ opal/mca/btl/openib/btl_openib_component.c | 14 +++++++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 3bd5fe965da..c2686a0676a 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -1045,7 +1045,7 @@ int mca_btl_openib_add_procs( opal_bitmap_clear_all_bits(reachable); opal_show_help("help-mpi-btl-openib.txt", "ib port not selected", true, opal_process_info.nodename, - ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num); + openib_btl->device_name, openib_btl->port_num); return OPAL_SUCCESS; } @@ -1718,11 +1718,11 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) free(openib_btl->cpcs[i]); } free(openib_btl->cpcs); - } - /* Release device if there are no more users */ - if(!(--openib_btl->device->btls)) { - OBJ_RELEASE(openib_btl->device); + /* Release device if there are no more users */ + if(!(--openib_btl->device->allowed_btls)) { + OBJ_RELEASE(openib_btl->device); + } } if (NULL != openib_btl->qps) { diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index a5817a8daee..0b85bfb5662 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -392,6 +392,7 @@ typedef struct mca_btl_openib_device_t { /* Whether this device supports eager RDMA */ uint8_t use_eager_rdma; uint8_t btls; /** < number of btls using this device */ + uint8_t allowed_btls; /** < number of allowed btls using this device */ opal_pointer_array_t *endpoints; opal_pointer_array_t *device_btls; uint16_t hp_cq_polls; @@ -483,6 +484,7 @@ struct mca_btl_openib_module_t { uint8_t num_cpcs; mca_btl_openib_device_t *device; + char * device_name; uint8_t port_num; /**< ID of the PORT */ uint16_t pkey_index; struct ibv_port_attr ib_port_attr; diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 4a714b4d1b3..fcc0ac56973 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -648,9 +648,10 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, sizeof(mca_btl_openib_module)); ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; - openib_btl->device = device; openib_btl->port_num = (uint8_t) port_num; openib_btl->allowed = false; + openib_btl->device = NULL; + openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev)); OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t); opal_list_append(btl_list, (opal_list_item_t*) ib_selected); opal_pointer_array_add(device->device_btls, (void*) openib_btl); @@ -784,6 +785,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; openib_btl->device = device; + openib_btl->device_name = NULL; openib_btl->port_num = (uint8_t) port_num; openib_btl->pkey_index = pkey_index; openib_btl->lid = lid; @@ -904,6 +906,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, opal_list_append(btl_list, (opal_list_item_t*) ib_selected); opal_pointer_array_add(device->device_btls, (void*) openib_btl); ++device->btls; + ++device->allowed_btls; ++mca_btl_openib_component.ib_num_btls; ++mca_btl_openib_component.ib_allowed_btls; if (-1 != mca_btl_openib_component.ib_max_btls && @@ -1933,7 +1936,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) if (ib_port_attr.active_mtu < device->mtu){ device->mtu = ib_port_attr.active_mtu; } - if (mca_btl_openib_component.apm_ports && device->btls > 0) { + if (mca_btl_openib_component.apm_ports && device->allowed_btls > 0) { init_apm_port(device, i, ib_port_attr.lid); break; } @@ -1969,7 +1972,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) /* If we made a BTL, check APM status and return. Otherwise, fall through and destroy everything */ - if (device->btls > 0) { + if (device->allowed_btls > 0) { /* if apm was enabled it should be > 1 */ if (1 == mca_btl_openib_component.apm_ports) { opal_show_help("help-mpi-btl-openib.txt", @@ -2290,6 +2293,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) good: mca_btl_openib_component.devices_count++; return OPAL_SUCCESS; + } else if (device->btls > 0) { + /* no port is allowed to be used by btl/openib, + * so release the device right away */ + OBJ_RELEASE(device); + return OPAL_SUCCESS; } error: