From d25270ce5cb4279514ebc117a3e798d09a8105f7 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Tue, 24 Aug 2021 13:48:41 -0700 Subject: [PATCH] LU-14955 lnet: Use fatal NI if none other available Allow NI in fatal state to be selected for sending if there are no NIs in non-fatal state. Lustre-change: https://review.whamcloud.com/44746/ Lustre-commit: ff3322fd0c77a8042558711d9f410326d2aa6375 Test-Parameters: trivial testlist=sanity-lnet HPE-bug-id: LUS-11019 Signed-off-by: Serguei Smirnov Signed-off-by: Chris Horn Change-Id: Iab8ef6ee5c5f45896196dbd88a2f61e004278297 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53153 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lnet/lnet/lib-move.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 1b50525..f00658f 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1609,6 +1609,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, int best_healthv; __u32 best_sel_prio; unsigned int best_dev_prio; + int best_ni_fatal; unsigned int dev_idx = UINT_MAX; struct page *page = lnet_get_first_page(md, offset); msg->msg_rdma_force = lnet_is_rdma_only_page(page); @@ -1629,6 +1630,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, best_dev_prio = UINT_MAX; best_credits = INT_MIN; best_healthv = 0; + best_ni_fatal = true; } else { best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx); shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, @@ -1636,6 +1638,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, best_credits = atomic_read(&best_ni->ni_tx_credits); best_healthv = atomic_read(&best_ni->ni_healthv); best_sel_prio = best_ni->ni_sel_priority; + best_ni_fatal = atomic_read(&best_ni->ni_fatal_error_on); } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { @@ -1673,20 +1676,24 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, * Select on health, shorter distance, available * credits, then round-robin. */ - if (ni_fatal) - continue; - if (best_ni) - CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n", - libcfs_nid2str(ni->ni_nid), ni_credits, distance, + CDEBUG(D_NET, "compare ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d] with best_ni %s [f:%s, c:%d, d:%d, s:%d, p:%u, g:%u, h:%d]\n", + libcfs_nid2str(ni->ni_nid), + ni_fatal ? "y" : "n", ni_credits, distance, ni->ni_seq, ni_sel_prio, ni_dev_prio, ni_healthv, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) - : "not selected", best_credits, shortest_distance, + : "not selected", + best_ni_fatal ? "y" : "n", best_credits, shortest_distance, (best_ni) ? best_ni->ni_seq : 0, best_sel_prio, best_dev_prio, best_healthv); else goto select_ni; + if (ni_fatal && !best_ni_fatal) + continue; + else if (!ni_fatal && best_ni_fatal) + goto select_ni; + if (ni_healthv < best_healthv) continue; else if (ni_healthv > best_healthv) @@ -1722,6 +1729,7 @@ select_ni: best_healthv = ni_healthv; best_ni = ni; best_credits = ni_credits; + best_ni_fatal = ni_fatal; } CDEBUG(D_NET, "selected best_ni %s\n", -- 1.8.3.1