Whamcloud - gitweb
LU-5579 ldlm: re-sent enqueue vs lock destroy race
author Vitaly Fertman <vitaly_fertman@xyratex.com>
Mon, 29 Sep 2014 23:42:32 +0000 (19:42 -0400)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 7 Oct 2014 15:44:26 +0000 (11:44 -0400)
Upon lock enqueue re-send, the lock is pinned by ldlm_handle_enqueue0();
however, this may race with client eviction or even a lock cancel (if
a reply for the original RPC finally reached the client), and the
lock cannot be found by cookie anymore:

 ASSERTION( lock != NULL ) failed: Invalid lock handle

Signed-off-by: Vitaly Fertman <vitaly_fertman@xyratex.com>
Change-Id: I9d8156bf78a1b83ac22ffaa1148feb43bef37b1a
Xyratex-bug-id: MRP-2094

lustre/mdt/mdt_handler.c
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh

index 3fcb290..79da610 100644 (file)
@@ -1349,7 +1349,7 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info,
 
                rc = mdt_check_resent_lock(info, child, lhc);
                if (rc < 0) {
-                       RETURN(-EPROTO);
+                       RETURN(rc);
                } else if (rc > 0) {
                         mdt_lock_handle_init(lhc);
                         mdt_lock_reg_init(lhc, LCK_PR);
@@ -1432,13 +1432,13 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info,
        if (unlikely(IS_ERR(child)))
                GOTO(out_parent, rc = PTR_ERR(child));
 
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RESEND, obd_timeout*2);
        rc = mdt_check_resent_lock(info, child, lhc);
        if (rc < 0) {
                GOTO(out_child, rc);
        } else if (rc > 0) {
                bool try_layout = false;
 
-               OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RESEND, obd_timeout*2);
                mdt_lock_handle_init(lhc);
                mdt_lock_reg_init(lhc, LCK_PR);
 
@@ -2627,8 +2627,14 @@ int mdt_check_resent_lock(struct mdt_thread_info *info,
 
                lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
                LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
-               LASSERTF(lock != NULL, "Invalid lock handle "LPX64"\n",
-                        lhc->mlh_reg_lh.cookie);
+               if (lock == NULL) {
+                       /* Lock is pinned by ldlm_handle_enqueue0() as it is
+                        * a resend case, however, it could be already destroyed
+                        * due to client eviction or a raced cancel RPC. */
+                       LDLM_DEBUG_NOLOCK("Invalid lock handle "LPX64"\n",
+                                         lhc->mlh_reg_lh.cookie);
+                       RETURN(-ESTALE);
+               }
 
                if (!fid_res_name_eq(mdt_object_fid(mo),
                                     &lock->l_resource->lr_name)) {
index c5a9fde..241a8e2 100644 (file)
@@ -793,6 +793,7 @@ int tgt_enqueue(struct tgt_session_info *tsi)
        if (rc)
                RETURN(err_serious(rc));
 
+       tsi->tsi_reply_fail_id = OBD_FAIL_LDLM_REPLY;
        RETURN(req->rq_status);
 }
 EXPORT_SYMBOL(tgt_enqueue);
index c11e009..d0c63b8 100755 (executable)
@@ -1284,6 +1284,36 @@ run_test 61 "Verify to not reuse orphan objects - bug 17025"
 #}
 #run_test 62 "Verify connection flags race - bug LU-1716"
 
+test_66()
+{
+       local list=$(comma_list $(osts_nodes))
+
+       # modify dir so that next revalidate would not obtain UPDATE lock
+       touch $DIR
+
+       # drop 1 reply with UPDATE lock
+       mcreate $DIR/$tfile || error "mcreate failed: $?"
+       drop_ldlm_reply_once "stat $DIR/$tfile" &
+       sleep 2
+
+       # make the re-sent lock to sleep
+#define OBD_FAIL_MDS_RESEND              0x136
+       do_nodes $list lctl set_param fail_loc=0x80000136
+
+       #initiate the re-connect & re-send
+       mdccli=$($LCTL dl | awk '/-mdc-/ {print $4;}')
+       conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid)
+       $LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}"
+       sleep 2
+
+       #initiate the client eviction while enqueue re-send is in progress
+       mds_evict_client
+
+       client_reconnect
+       wait
+}
+run_test 66 "lock enqueue re-send vs client eviction"
+
 check_cli_ir_state()
 {
         local NODE=${1:-$HOSTNAME}