From 7fff052c930da4822c3b2a13d130da7473a20a58 Mon Sep 17 00:00:00 2001 From: Andriy Skulysh Date: Tue, 3 May 2016 10:41:56 +0300 Subject: [PATCH] LU-7791 ldlm: signal vs CP callback race In case of interrupted wait for a CP AST failed_lock_cleanup() sets LDLM_FL_LOCAL_ONLY, so the client wouldn't cancel the lock on CP AST. A lock isn't canceled on the server on reception Cray-bug-id: LUS-2021 Change-Id: Id1e365b41f1fb8a0f9a32c0c929457b22ceba8ef Signed-off-by: Andriy Skulysh Reviewed-by: Alexander Boyko Reviewed-by: Andrew Perepechko Reviewed-on: https://review.whamcloud.com/19898 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexandr Boyko Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lockd.c | 51 +++++++++++++++++++++++++++----------------- lustre/ldlm/ldlm_request.c | 3 +++ lustre/tests/sanityn.sh | 36 +++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+), 20 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index a728bef..5cf86de 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -386,6 +386,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a #define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index e208689..faf7545 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1884,15 +1884,29 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, EXIT; } +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + /** * Callback handler for receiving incoming completion ASTs. * * This only can happen on client side. */ -static void ldlm_handle_cp_callback(struct ptlrpc_request *req, - struct ldlm_namespace *ns, - struct ldlm_request *dlm_req, - struct ldlm_lock *lock) +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) { struct list_head ast_list; int lvb_len; @@ -1906,6 +1920,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { long to = cfs_time_seconds(1); + ldlm_callback_reply(req, 0); + while (to > 0) { set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(to); @@ -1949,6 +1965,12 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, lock_res_and_lock(lock); } + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + if (ldlm_is_destroyed(lock) || ldlm_is_granted(lock)) { /* b=11300: the lock has already been granted */ @@ -2021,6 +2043,8 @@ out: wake_up(&lock->l_waitq); } LDLM_LOCK_RELEASE(lock); + + return 0; } /** @@ -2077,20 +2101,6 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, EXIT; } -static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) -{ - if (req->rq_no_reply) - return 0; - - req->rq_status = rc; - if (!req->rq_packed_final) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - return ptlrpc_reply(req); -} - static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, enum ldlm_cancel_flags cancel_flags) { @@ -2426,8 +2436,9 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) case LDLM_CP_CALLBACK: CDEBUG(D_INODE, "completion ast\n"); req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - ldlm_callback_reply(req, 0); - ldlm_handle_cp_callback(req, ns, dlm_req, lock); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); break; case LDLM_GL_CALLBACK: CDEBUG(D_INODE, "glimpse ast\n"); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 691f10c..143d0a9 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -1226,6 +1226,9 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) bool local_only; LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + /* Set this flag to prevent others from getting new references*/ lock_res_and_lock(lock); ldlm_set_cbpending(lock); diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 6be8d80..72a3971 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -4514,6 +4514,42 @@ test_93() { } run_test 93 "alloc_rr should not allocate on same ost" +test_94() { + $LCTL set_param osc.*.idle_timeout=0 + dd if=/dev/zero of=$DIR2/$tfile bs=4k count=2 conv=fsync + + local before=$(date +%s) + local evict + + $LCTL mark write +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 + $LCTL set_param fail_val=5 fail_loc=0x80000312 + dd if=/dev/zero of=$DIR/$tfile conv=notrunc oflag=append bs=4k count=1 & + local pid=$! + sleep 2 + +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 + $LCTL set_param fail_val=6 fail_loc=0x80000329 + $LCTL mark kill $pid + kill -ALRM $pid + + dd if=/dev/zero of=$DIR2/$tfile conv=notrunc oflag=append bs=4k count=1 + + wait $pid + dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 conv=fsync + + evict=$(do_facet client $LCTL get_param \ + osc.$FSNAME-OST*-osc-*/state | + awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }') + + [ -z "$evict" ] || [[ $evict -le $before ]] || + (do_facet client $LCTL get_param \ + osc.$FSNAME-OST*-osc-*/state; + error "eviction happened: $evict before:$before") + $LCTL set_param osc.*.idle_timeout=debug +} +run_test 94 "signal vs CP callback race" + # Data-on-MDT tests test_100a() { skip "Reserved for glimpse-ahead" && return -- 1.8.3.1