Whamcloud - gitweb
LU-14955 lnet: Use fatal NI if none other available 46/44746/6
author Serguei Smirnov <ssmirnov@whamcloud.com>
Tue, 24 Aug 2021 20:48:41 +0000 (13:48 -0700)
committer Oleg Drokin <green@whamcloud.com>
Thu, 1 Sep 2022 05:53:19 +0000 (05:53 +0000)
Allow NI in fatal state to be selected for sending if there are no
NIs in non-fatal state.

Test-Parameters: trivial testlist=sanity-lnet
HPE-bug-id: LUS-11019
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Iab8ef6ee5c5f45896196dbd88a2f61e004278297
Reviewed-on: https://review.whamcloud.com/44746
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/lib-move.c

index 81261c9..41d574a 100644 (file)
@@ -1635,6 +1635,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
        int best_healthv;
        __u32 best_sel_prio;
        unsigned int best_dev_prio;
+       int best_ni_fatal;
        unsigned int dev_idx = UINT_MAX;
        bool gpu = md ? (md->md_flags & LNET_MD_FLAG_GPU) : false;
 
@@ -1657,6 +1658,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                best_dev_prio = UINT_MAX;
                best_credits = INT_MIN;
                best_healthv = 0;
+               best_ni_fatal = true;
        } else {
                best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx);
                shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
@@ -1664,6 +1666,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                best_credits = atomic_read(&best_ni->ni_tx_credits);
                best_healthv = atomic_read(&best_ni->ni_healthv);
                best_sel_prio = best_ni->ni_sel_priority;
+               best_ni_fatal = atomic_read(&best_ni->ni_fatal_error_on);
        }
 
        while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
@@ -1701,20 +1704,25 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                 * Select on health, selection policy, direct dma prio,
                 * shorter distance, available credits, then round-robin.
                 */
-               if (ni_fatal)
-                       continue;
-
                if (best_ni)
-                       CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n",
-                              libcfs_nidstr(&ni->ni_nid), ni_credits, distance,
+                       CDEBUG(D_NET, "compare ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n",
+                              libcfs_nidstr(&ni->ni_nid),
+                              ni_fatal ? "y" : "n", ni_credits, distance,
                               ni->ni_seq, ni_sel_prio, ni_dev_prio, ni_healthv,
                               (best_ni) ? libcfs_nidstr(&best_ni->ni_nid)
-                              : "not selected", best_credits, shortest_distance,
+                              : "not selected",
+                              best_ni_fatal ? "y" : "n", best_credits,
+                              shortest_distance,
                               (best_ni) ? best_ni->ni_seq : 0,
                               best_sel_prio, best_dev_prio, best_healthv);
                else
                        goto select_ni;
 
+               if (ni_fatal && !best_ni_fatal)
+                       continue;
+               else if (!ni_fatal && best_ni_fatal)
+                       goto select_ni;
+
                if (ni_healthv < best_healthv)
                        continue;
                else if (ni_healthv > best_healthv)
@@ -1750,6 +1758,7 @@ select_ni:
                best_healthv = ni_healthv;
                best_ni = ni;
                best_credits = ni_credits;
+               best_ni_fatal = ni_fatal;
        }
 
        CDEBUG(D_NET, "selected best_ni %s\n",