From d29c0438bdf38e89d5638030b3770d7740121f8d Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Mon, 29 Sep 2014 19:42:32 -0400 Subject: [PATCH] LU-5579 ldlm: re-sent enqueue vs lock destroy race Upon lock enqueue re-send, the lock is pinned by ldlm_handle_enqueue0(); however, it may race with client eviction or even a lock cancel (if a reply for the original RPC finally reached the client), and the lock cannot be found by cookie anymore: ASSERTION( lock != NULL ) failed: Invalid lock handle Signed-off-by: Vitaly Fertman Change-Id: I9d8156bf78a1b83ac22ffaa1148feb43bef37b1a Xyratex-bug-id: MRP-2094 --- lustre/mdt/mdt_handler.c | 14 ++++++++++---- lustre/target/tgt_handler.c | 1 + lustre/tests/recovery-small.sh | 30 ++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 3fcb290..79da610 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -1349,7 +1349,7 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info, rc = mdt_check_resent_lock(info, child, lhc); if (rc < 0) { - RETURN(-EPROTO); + RETURN(rc); } else if (rc > 0) { mdt_lock_handle_init(lhc); mdt_lock_reg_init(lhc, LCK_PR); @@ -1432,13 +1432,13 @@ static int mdt_getattr_name_lock(struct mdt_thread_info *info, if (unlikely(IS_ERR(child))) GOTO(out_parent, rc = PTR_ERR(child)); + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RESEND, obd_timeout*2); rc = mdt_check_resent_lock(info, child, lhc); if (rc < 0) { GOTO(out_child, rc); } else if (rc > 0) { bool try_layout = false; - OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RESEND, obd_timeout*2); mdt_lock_handle_init(lhc); mdt_lock_reg_init(lhc, LCK_PR); @@ -2627,8 +2627,14 @@ int mdt_check_resent_lock(struct mdt_thread_info *info, lock = ldlm_handle2lock(&lhc->mlh_reg_lh); LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT); - LASSERTF(lock != NULL, "Invalid lock handle "LPX64"\n", - lhc->mlh_reg_lh.cookie); + if (lock == NULL) { + /* Lock is pinned by ldlm_handle_enqueue0() 
as it is + * a resend case, however, it could be already destroyed + * due to client eviction or a raced cancel RPC. */ + LDLM_DEBUG_NOLOCK("Invalid lock handle "LPX64"\n", + lhc->mlh_reg_lh.cookie); + RETURN(-ESTALE); + } if (!fid_res_name_eq(mdt_object_fid(mo), &lock->l_resource->lr_name)) { diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index c5a9fde..241a8e2 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -793,6 +793,7 @@ int tgt_enqueue(struct tgt_session_info *tsi) if (rc) RETURN(err_serious(rc)); + tsi->tsi_reply_fail_id = OBD_FAIL_LDLM_REPLY; RETURN(req->rq_status); } EXPORT_SYMBOL(tgt_enqueue); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index c11e009..d0c63b8 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -1284,6 +1284,36 @@ run_test 61 "Verify to not reuse orphan objects - bug 17025" #} #run_test 62 "Verify connection flags race - bug LU-1716" +test_66() +{ + local list=$(comma_list $(osts_nodes)) + + # modify dir so that next revalidate would not obtain UPDATE lock + touch $DIR + + # drop 1 reply with UPDATE lock + mcreate $DIR/$tfile || error "mcreate failed: $?" + drop_ldlm_reply_once "stat $DIR/$tfile" & + sleep 2 + + # make the re-sent lock to sleep +#define OBD_FAIL_MDS_RESEND 0x136 + do_nodes $list lctl set_param fail_loc=0x80000136 + + #initiate the re-connect & re-send + mdccli=$($LCTL dl | awk '/-mdc-/ {print $4;}') + conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid) + $LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}" + sleep 2 + + #initiate the client eviction while enqueue re-send is in progress + mds_evict_client + + client_reconnect + wait +} +run_test 66 "lock enqueue re-send vs client eviction" + check_cli_ir_state() { local NODE=${1:-$HOSTNAME} -- 1.8.3.1