Whamcloud - gitweb
LU-16393 o2iblnd: add IBLND_REJECT_EARLY reject reason
author Serguei Smirnov <ssmirnov@whamcloud.com>
Thu, 13 Jul 2023 00:29:56 +0000 (17:29 -0700)
committer Andreas Dilger <adilger@whamcloud.com>
Mon, 28 Aug 2023 16:16:47 +0000 (16:16 +0000)
Add IBLND_REJECT_EARLY reason for rejecting connection request:
to be used when the device doesn't have any nets added yet or
when there are no active NIs on the net to handle the connection.
These conditions are supposed to occur only when LNI is being
added/initialized, so report at CNETERROR level vs. CERROR.

In lnet, set NI state to ACTIVE only after it has been added
to the list of NIs for the net, so that LND can know that
the NI can be used to accept connections.

Lustre-change: https://review.whamcloud.com/51651
Lustre-commit: 673ff86a84ad5d11cde24aa7411c45385ad1c633

Test-parameters: trivial
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I59efb2fdf5d5ceabb6ff23f638ec85da82d57b99
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/52015
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/lnet/api-ni.c

index 75e88b0..03bad00 100644 (file)
@@ -580,6 +580,7 @@ struct kib_rej {
 /* peer_ni's msg queue size doesn't match mine */
 #define IBLND_REJECT_MSG_QUEUE_SIZE  7
 #define IBLND_REJECT_INVALID_SRV_ID  8
+#define IBLND_REJECT_EARLY          9          /* NI not initialized yet */
 
 /***********************************************************************/
 
index fbc26bd..f8cc52f 100644 (file)
@@ -2491,6 +2491,31 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
        if (ni != NULL) {
                net = (struct kib_net *)ni->ni_data;
                rej.ibr_incarnation = net->ibn_incarnation;
+       } else {
+               if (ibdev->ibd_nnets == 0) {
+                       rej.ibr_why = IBLND_REJECT_EARLY;
+                       CNETERR("Can't accept conn from %s on %s (%s:%d:%pI4h): net for nid %s not added yet\n",
+                               libcfs_nid2str(nid),
+                               libcfs_nid2str(net->ibn_ni->ni_nid),
+                               ibdev->ibd_ifname, ibdev->ibd_nnets,
+                               &ibdev->ibd_ifip,
+                               libcfs_nid2str(reqmsg->ibm_dstnid));
+                       goto failed;
+               }
+               list_for_each_entry(net, &ibdev->ibd_nets, ibn_list) {
+                       if ((net->ibn_dev == ibdev) &&
+                           (net->ibn_ni != NULL) &&
+                           (net->ibn_ni->ni_state != LNET_NI_STATE_ACTIVE)) {
+                               rej.ibr_why = IBLND_REJECT_EARLY;
+                               CNETERR("Can't accept conn from %s on %s (%s:%d:%pI4h): nid %s not ready\n",
+                                      libcfs_nid2str(nid),
+                                      libcfs_nid2str(net->ibn_ni->ni_nid),
+                                      ibdev->ibd_ifname, ibdev->ibd_nnets,
+                                      &ibdev->ibd_ifip,
+                                      libcfs_nid2str(reqmsg->ibm_dstnid));
+                               goto failed;
+                       }
+               }
        }
 
        if (ni == NULL ||                         /* no matching net */
@@ -2955,6 +2980,11 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
                                        libcfs_nid2str(peer_ni->ibp_nid));
                                 break;
 
+                       case IBLND_REJECT_EARLY:
+                               CNETERR("%s rejected: tried too early\n",
+                                      libcfs_nid2str(peer_ni->ibp_nid));
+                               break;
+
                         default:
                                 CERROR("%s rejected: o2iblnd reason %d\n",
                                        libcfs_nid2str(peer_ni->ibp_nid),
index 9d54e67..46dcdee 100644 (file)
@@ -2304,10 +2304,6 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun)
                goto failed0;
        }
 
-       lnet_ni_lock(ni);
-       ni->ni_state = LNET_NI_STATE_ACTIVE;
-       lnet_ni_unlock(ni);
-
        /* We keep a reference on the loopback net through the loopback NI */
        if (net->net_lnd->lnd_type == LOLND) {
                lnet_ni_addref(ni);
@@ -2466,6 +2462,14 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun)
        lnet_net_lock(LNET_LOCK_EX);
        list_splice_tail(&local_ni_list, &net_l->net_ni_list);
        lnet_incr_dlc_seq();
+
+       list_for_each_entry(ni, &net_l->net_ni_list, ni_netlist) {
+               if (!ni)
+                       break;
+               lnet_ni_lock(ni);
+               ni->ni_state = LNET_NI_STATE_ACTIVE;
+               lnet_ni_unlock(ni);
+       }
        lnet_net_unlock(LNET_LOCK_EX);
 
        /* if the network is not unique then we don't want to keep