Whamcloud - gitweb
LU-18208 lnet: Server VM crashed: unable to handle 22/56322/3
author: Frank Sehr <fsehr@whamcloud.com>
Tue, 10 Sep 2024 23:30:21 +0000 (16:30 -0700)
committer: Oleg Drokin <green@whamcloud.com>
Mon, 16 Sep 2024 15:13:11 +0000 (15:13 +0000)
Revert "LU-18160 lnet: ensure lnetctl ping completes in a finite time"
It seems the patch for LU-18160 introduced crashes, possibly because it
changed the wait from wait_for_completion_timeout() to
wait_for_completion_interruptible_timeout(). Reverting that patch solved
the problem.

This reverts commit 1666840bb06bbeeb35b2f9a51f9235c36886a3c6.

Test-Parameters: trivial testlist=sanity
Signed-off-by: Frank Sehr <fsehr@whamcloud.com>
Change-Id: Ie48185eb973eee65df2810d7acf940cf6981b83e
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56322
Reviewed-by: Timothy Day <timday@amazon.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
lnet/lnet/api-ni.c

index 2053f1b..91f993f 100644 (file)
@@ -10019,6 +10019,7 @@ static int lnet_ping(struct lnet_processid *id, struct lnet_nid *src_nid,
        u32 *st;
        int nob;
        int rc;
+       int rc2;
 
        genradix_init(&plist->lgpl_list);
 
@@ -10056,22 +10057,27 @@ static int lnet_ping(struct lnet_processid *id, struct lnet_nid *src_nid,
        init_completion(&pd.completion);
 
        rc = LNetMDBind(&md, LNET_UNLINK, &pd.mdh);
-       if (rc) {
+       if (rc != 0) {
                CERROR("Can't bind MD: %d\n", rc);
                goto fail_ping_buffer_decref;
        }
 
        rc = LNetGet(src_nid, pd.mdh, id, LNET_RESERVED_PORTAL,
                     LNET_PROTO_PING_MATCHBITS, 0, false);
-       if (rc)
-               LASSERT(!LNetMDUnlink(pd.mdh));
+       if (rc != 0) {
+               /* Don't CERROR; this could be deliberate! */
+               rc2 = LNetMDUnlink(pd.mdh);
+               LASSERT(rc2 == 0);
 
-       /* Ensure completion in finite time... */
-       wait_for_completion_interruptible_timeout(&pd.completion,
-                                                 timeout);
+               /* NB must wait for the UNLINK event below... */
+       }
 
-       if (!pd.pd_unlinked)
+       /* Ensure completion in finite time... */
+       wait_for_completion_timeout(&pd.completion, timeout);
+       if (!pd.pd_unlinked) {
                LNetMDUnlink(pd.mdh);
+               wait_for_completion(&pd.completion);
+       }
 
        if (!pd.replied) {
                rc = pd.rc ?: -EIO;