From 673ff86a84ad5d11cde24aa7411c45385ad1c633 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Wed, 12 Jul 2023 17:29:56 -0700 Subject: [PATCH] LU-16393 o2iblnd: add IBLND_REJECT_EARLY reject reason Add IBLND_REJECT_EARLY reason for rejecting connection request: to be used when the device doesn't have any nets added yet or when there are no active NIs on the net to handle the connection. These conditions are supposed to occur only when LNI is being added/initialized, so report at CNETERR level vs. CERROR. In lnet, set NI state to ACTIVE only after it has been added to the list of NIs for the net, so that LND can know that the NI can be used to accept connections. Test-parameters: trivial Signed-off-by: Serguei Smirnov Change-Id: I59efb2fdf5d5ceabb6ff23f638ec85da82d57b99 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51651 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Cyril Bordage Reviewed-by: Frank Sehr Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd-idl.h | 1 + lnet/klnds/o2iblnd/o2iblnd_cb.c | 30 ++++++++++++++++++++++++++++++ lnet/lnet/api-ni.c | 12 ++++++++---- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd-idl.h b/lnet/klnds/o2iblnd/o2iblnd-idl.h index 35df50b..544bab8 100644 --- a/lnet/klnds/o2iblnd/o2iblnd-idl.h +++ b/lnet/klnds/o2iblnd/o2iblnd-idl.h @@ -149,6 +149,7 @@ struct kib_rej { /* peer_ni's msg queue size doesn't match mine */ #define IBLND_REJECT_MSG_QUEUE_SIZE 7 #define IBLND_REJECT_INVALID_SRV_ID 8 +#define IBLND_REJECT_EARLY 9 /* NI not initialized yet */ /***********************************************************************/ diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 32e67bf..74006aa 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -2572,6 +2572,31 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) if (ni != NULL) { net = (struct kib_net *)ni->ni_data; 
rej.ibr_incarnation = net->ibn_incarnation; + } else { + if (ibdev->ibd_nnets == 0) { + rej.ibr_why = IBLND_REJECT_EARLY; + CNETERR("Can't accept conn from %s on %s (%s:%d:%pI4h): net for nid %s not added yet\n", + libcfs_nid2str(nid), + libcfs_nidstr(&net->ibn_ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + goto failed; + } + list_for_each_entry(net, &ibdev->ibd_nets, ibn_list) { + if ((net->ibn_dev == ibdev) && + (net->ibn_ni != NULL) && + (net->ibn_ni->ni_state != LNET_NI_STATE_ACTIVE)) { + rej.ibr_why = IBLND_REJECT_EARLY; + CNETERR("Can't accept conn from %s on %s (%s:%d:%pI4h): nid %s not ready\n", + libcfs_nid2str(nid), + libcfs_nidstr(&net->ibn_ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + &ibdev->ibd_ifip, + libcfs_nid2str(reqmsg->ibm_dstnid)); + goto failed; + } + } } if (ni == NULL || /* no matching net */ @@ -3037,6 +3062,11 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) libcfs_nid2str(peer_ni->ibp_nid)); break; + case IBLND_REJECT_EARLY: + CNETERR("%s rejected: tried too early\n", + libcfs_nid2str(peer_ni->ibp_nid)); + break; + default: CERROR("%s rejected: o2iblnd reason %d\n", libcfs_nid2str(peer_ni->ibp_nid), diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 8251b81..bc82b02 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -2550,10 +2550,6 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) goto failed0; } - lnet_ni_lock(ni); - ni->ni_state = LNET_NI_STATE_ACTIVE; - lnet_ni_unlock(ni); - /* We keep a reference on the loopback net through the loopback NI */ if (net->net_lnd->lnd_type == LOLND) { lnet_ni_addref(ni); @@ -2729,6 +2725,14 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) lnet_net_lock(LNET_LOCK_EX); list_splice_tail(&local_ni_list, &net_l->net_ni_list); lnet_incr_dlc_seq(); + + list_for_each_entry(ni, &net_l->net_ni_list, ni_netlist) { + if (!ni) + break; + 
lnet_ni_lock(ni); + ni->ni_state = LNET_NI_STATE_ACTIVE; + lnet_ni_unlock(ni); + } lnet_net_unlock(LNET_LOCK_EX); /* if the network is not unique then we don't want to keep -- 1.8.3.1