From 8f4336481a482544f54b7e8a9d63340a57062d68 Mon Sep 17 00:00:00 2001 From: Andriy Skulysh Date: Tue, 3 May 2016 10:41:56 +0300 Subject: [PATCH] LU-7791 ldlm: signal vs CP callback race In case of interrupted wait for a CP AST failed_lock_cleanup() sets LDLM_FL_LOCAL_ONLY, so the client wouldn't cancel the lock on CP AST. A lock isn't canceled on the server on reception Lustre-change: https://review.whamcloud.com/19898 Lustre-commit: 7fff052c930da4822c3b2a13d130da7473a20a58 Cray-bug-id: LUS-2021 Change-Id: Id1e365b41f1fb8a0f9a32c0c929457b22ceba8ef Signed-off-by: Andriy Skulysh Reviewed-by: Alexander Boyko Reviewed-by: Andrew Perepechko Reviewed-by: Alexandr Boyko Reviewed-on: https://review.whamcloud.com/44297 Reviewed-by: Patrick Farrell Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lockd.c | 87 +++++++++++++++++++++++++------------------- lustre/ldlm/ldlm_request.c | 9 +++-- lustre/tests/sanityn.sh | 36 ++++++++++++++++++ 4 files changed, 92 insertions(+), 41 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 32121b5..b5aed43 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -391,6 +391,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LDLM_SRV_GL_AST 0x326 #define OBD_FAIL_LDLM_WATERMARK_LOW 0x327 #define OBD_FAIL_LDLM_WATERMARK_HIGH 0x328 +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a #define OBD_FAIL_LDLM_PROLONG_PAUSE 0x32b diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index bb45f4f..137ba42 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1802,12 +1802,26 @@ void ldlm_handle_bl_callback(struct ldlm_namespace *ns, EXIT; } +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + /** * Callback handler for receiving incoming completion ASTs. * * This only can happen on client side. */ -static void ldlm_handle_cp_callback(struct ptlrpc_request *req, +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, struct ldlm_namespace *ns, struct ldlm_request *dlm_req, struct ldlm_lock *lock) @@ -1823,6 +1837,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { long to = cfs_time_seconds(1); + ldlm_callback_reply(req, 0); + while (to > 0) { set_current_state(TASK_INTERRUPTIBLE); to = schedule_timeout(to); @@ -1867,6 +1883,12 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, lock_res_and_lock(lock); } + if (ldlm_is_failed(lock)) { + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + RETURN(-EINVAL); + } + if (ldlm_is_destroyed(lock) || ldlm_is_granted(lock)) { /* bug 11300: the lock has already been granted */ @@ -1935,6 +1957,8 @@ out: wake_up(&lock->l_waitq); } LDLM_LOCK_RELEASE(lock); + + return 0; } /** @@ -1991,20 +2015,6 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, EXIT; } -static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) -{ - if (req->rq_no_reply) - return 0; - - req->rq_status = rc; - if (!req->rq_packed_final) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - return ptlrpc_reply(req); -} - static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, enum ldlm_cancel_flags cancel_flags) { @@ -2311,30 +2321,31 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) CDEBUG(D_INODE, "blocking ast\n"); req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); if (!ldlm_is_cancel_on_block(lock)) { - rc = ldlm_callback_reply(req, 0); - if (req->rq_no_reply || rc) - ldlm_callback_errmsg(req, "Normal process", rc, - &dlm_req->lock_handle[0]); - } - if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) - ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); - break; - case LDLM_CP_CALLBACK: - CDEBUG(D_INODE, "completion ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - ldlm_callback_reply(req, 0); - ldlm_handle_cp_callback(req, ns, dlm_req, lock); - break; - case LDLM_GL_CALLBACK: - CDEBUG(D_INODE, "glimpse ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); - ldlm_handle_gl_callback(req, ns, dlm_req, lock); - break; - default: - LBUG(); /* checked above */ - } + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + rc = ldlm_handle_cp_callback(req, ns, dlm_req, lock); + if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) + ldlm_callback_reply(req, rc); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } - RETURN(0); + RETURN(0); } #ifdef HAVE_SERVER_SUPPORT diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 69736ab..a761407 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -1147,9 +1147,12 @@ static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) if (lock->l_conn_export) { bool local_only; - LDLM_DEBUG(lock, "client-side cancel"); - /* Set this flag to prevent others from getting new references*/ - lock_res_and_lock(lock); + LDLM_DEBUG(lock, "client-side cancel"); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL, + cfs_fail_val); + + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); ldlm_set_cbpending(lock); local_only = !!(lock->l_flags & (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index c8847e7..3e17f99 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -4612,6 +4612,42 @@ test_93() { } run_test 93 "alloc_rr should not allocate on same ost" +test_94() { + $LCTL set_param osc.*.idle_timeout=0 + dd if=/dev/zero of=$DIR2/$tfile bs=4k count=2 conv=fsync + + local before=$(date +%s) + local evict + + $LCTL mark write +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 + $LCTL set_param fail_val=5 fail_loc=0x80000312 + dd if=/dev/zero of=$DIR/$tfile conv=notrunc oflag=append bs=4k count=1 & + local pid=$! + sleep 2 + +#define OBD_FAIL_LDLM_PAUSE_CANCEL_LOCAL 0x329 + $LCTL set_param fail_val=6 fail_loc=0x80000329 + $LCTL mark kill $pid + kill -ALRM $pid + + dd if=/dev/zero of=$DIR2/$tfile conv=notrunc oflag=append bs=4k count=1 + + wait $pid + dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 conv=fsync + + evict=$(do_facet client $LCTL get_param \ + osc.$FSNAME-OST*-osc-*/state | + awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }') + + [ -z "$evict" ] || [[ $evict -le $before ]] || + (do_facet client $LCTL get_param \ + osc.$FSNAME-OST*-osc-*/state; + error "eviction happened: $evict before:$before") + $LCTL set_param osc.*.idle_timeout=debug +} +run_test 94 "signal vs CP callback race" + # Data-on-MDT tests test_100a() { skip "Reserved for glimpse-ahead" && return -- 1.8.3.1