From a880f385164547837a34df92acd6cdc8e9a9698d Mon Sep 17 00:00:00 2001 From: Andriy Skulysh Date: Wed, 2 Mar 2016 00:15:55 +0200 Subject: [PATCH] LU-7791 ldlm: signal vs CP callback race A lock isn't canceled on the server when the client receives a signal while it is blocked waiting for that lock to be granted, so the client is evicted for not responding to LDLM_FL_AST_SENT. Seagate-bug-id: MRP-3314 Change-Id: Ie025bb58b13fb2d741119f74c87439f917983268 Signed-off-by: Andriy Skulysh Reviewed-on: http://review.whamcloud.com/18498 Tested-by: Jenkins Reviewed-by: Vitaly Fertman Tested-by: Maloo Reviewed-by: Chris Horn Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lockd.c | 17 ++++++++++++----- lustre/ldlm/ldlm_request.c | 5 ++++- lustre/tests/sanityn.sh | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 971f8e3..919be18 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -366,6 +366,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x385 diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index e848e5c..42d76b8 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1793,7 +1793,7 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, * * This only can happen on client side. 
*/ -static void ldlm_handle_cp_callback(struct ptlrpc_request *req, +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, struct ldlm_namespace *ns, struct ldlm_request *dlm_req, struct ldlm_lock *lock) @@ -1837,6 +1837,11 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, } lock_res_and_lock(lock); + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } if (ldlm_is_destroyed(lock) || lock->l_granted_mode == lock->l_req_mode) { /* bug 11300: the lock has already been granted */ @@ -1916,6 +1921,8 @@ out: wake_up(&lock->l_waitq); } LDLM_LOCK_RELEASE(lock); + + return 0; } /** @@ -2322,10 +2329,10 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); break; case LDLM_CP_CALLBACK: - CDEBUG(D_INODE, "completion ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - ldlm_callback_reply(req, 0); - ldlm_handle_cp_callback(req, ns, dlm_req, lock); + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + ldlm_callback_reply(req, rc); break; case LDLM_GL_CALLBACK: CDEBUG(D_INODE, "glimpse ast\n"); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 1e2a6cd..1d61fed 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -1110,7 +1110,10 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) if (lock->l_conn_export) { bool local_only; - LDLM_DEBUG(lock, "client-side cancel"); + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + /* Set this flag to prevent others from getting new references*/ lock_res_and_lock(lock); ldlm_set_cbpending(lock); diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 096bebc..6b00d2f 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -3567,6 +3567,40 
@@ test_92() { } run_test 92 "create remote directory under orphan directory" +test_93() { + dd if=/dev/zero of=$DIR2/$tfile bs=4k count=2 conv=fsync + + local before=$(date +%s) + local evict + + $LCTL mark write +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 + $LCTL set_param fail_val=5 fail_loc=0x80000312 + dd if=/dev/zero of=$DIR/$tfile conv=notrunc oflag=append bs=4k count=1 & + local pid=$! + sleep 2 + +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 + $LCTL set_param fail_val=6 fail_loc=0x80000329 + $LCTL mark kill $pid + kill -ALRM $pid + + dd if=/dev/zero of=$DIR2/$tfile conv=notrunc oflag=append bs=4k count=1 + + wait $pid + dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 conv=fsync + + evict=$(do_facet client $LCTL get_param \ + osc.$FSNAME-OST*-osc-*/state | \ + awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }') + + [ -z "$evict" ] || [[ $evict -le $before ]] || + (do_facet client $LCTL get_param \ + osc.$FSNAME-OST*-osc-*/state; + error "eviction happened: $evict before:$before") +} +run_test 93 "signal vs CP callback race" + log "cleanup: ======================================================" # kill and wait in each test only guarentee script finish, but command in script -- 1.8.3.1