Whamcloud - gitweb
LU-16536 osp: don't cleanup ldlm in precleanup phase 25/49925/6
authorAlex Zhuravlev <bzzz@whamcloud.com>
Tue, 7 Feb 2023 09:29:24 +0000 (12:29 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 14 Feb 2023 06:07:04 +0000 (06:07 +0000)
Instead, do this in the cleanup phase so that all OSPs have a chance
to abort their in-flight RPCs, which can otherwise block MDT threads
that are holding LDLM locks.

Fixes: 226fd401f9 ("LU-7660 dne: support fs default stripe")
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: Ib3714b29c514a7fa938d47717dc36525654407d6
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49925
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_dev.c
lustre/target/out_handler.c
lustre/tests/replay-single.sh

index 395118e..923a1f5 100644 (file)
@@ -691,6 +691,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OUT_OBJECT_MISS       0x1708
 #define OBD_FAIL_OUT_EIO               0x1709
 #define OBD_FAIL_BUT_UPDATE_NET_REP    0x170a
+#define OBD_FAIL_OUT_DROP_DESTROY      0x170b
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
index 95c196e..5497028 100644 (file)
@@ -662,10 +662,15 @@ static int osp_process_config(const struct lu_env *env,
        case LCFG_PRE_CLEANUP:
                rc = osp_disconnect(d);
                osp_update_fini(env, d);
-               if (obd->obd_namespace != NULL)
-                       ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1);
                break;
        case LCFG_CLEANUP:
+               /*
+                * clean up ldlm so that the PRE_CLEANUP phase doesn't block
+                * waiting for locks held by MDT threads which are waiting for
+                * all OSPs to interrupt their in-flight RPCs
+                */
+               if (obd->obd_namespace != NULL)
+                       ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1);
                lu_dev_del_linkage(dev->ld_site, dev);
                rc = osp_shutdown(env, d);
                break;
index c293050..49accc7 100644 (file)
@@ -589,6 +589,9 @@ static int out_destroy(struct tgt_session_info *tsi)
                            tti->tti_u.update.tti_update_reply,
                            tti->tti_u.update.tti_update_reply_index);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OUT_DROP_DESTROY))
+               tsi->tsi_pill->rc_req->rq_no_reply = 1;
+
        RETURN(rc);
 }
 
index 2240e8e..a156b79 100755 (executable)
@@ -4997,6 +4997,29 @@ test_135() {
 }
 run_test 135 "Server failure in lock replay phase"
 
+test_136() {
+       (( $MDSCOUNT >= 3 )) || skip "needs > 2 MDTs"
+       (( MDS1_VERSION >= $(version_code 2.15.53) )) ||
+               skip "need MDS version >= 2.15.53 for LU-16536 fix"
+
+       $LFS mkdir -i0 -c3 $DIR/$tdir || error "can't mkdir"
+       $LFS getdirstripe $DIR/$tdir
+       sync;sync;sync
+
+#define OBD_FAIL_OUT_DROP_DESTROY      0x170b
+       local mdts=$(comma_list $(mdts_nodes))
+       do_nodes $mdts $LCTL set_param fail_loc=0x170b
+       rmdir $DIR/$tdir &      # backgrounded; OUT destroy replies are dropped
+       sleep 0.5               # let the rmdir get in flight before stopping
+       stop mds2
+       stop mds3
+       stop mds1
+       start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || error "MDT1 start failed"
+       start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || error "MDT2 start failed"
+       start mds3 $(mdsdevname 3) $MDS_MOUNT_OPTS || error "MDT3 start failed"
+}
+run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status