From 3be9beb00e7cca9d758b364a85537b0631dddc12 Mon Sep 17 00:00:00 2001
From: Alexander Boyko
Date: Wed, 15 Jun 2016 14:12:14 +0800
Subject: [PATCH] LU-7702 ldlm: skip lock if export failed

This patch resolves an IO vs. eviction race. After a failed eviction
the export stayed on the stale list while a client still had IO in
progress and reconnected during it. The client then sent a brw RPC
carrying the last lock cookie over the new connection. The lock with
the failed export was found and the assertion fired:
(ost_handler.c:1812:ost_prolong_lock_one())
ASSERTION( lock->l_export == opd->opd_exp ) failed:

1. Skip the lock in ldlm_handle2lock() if the lock's export has failed.
2. Add validation of the lock used for IO in hpreq_check(). The lock
   search is based on the granted interval tree. If the server does
   not have a valid lock, it replies to the client with -ESTALE.
3. Move lock prolongation to the ldlm layer.

Signed-off-by: Alexander Boyko
Seagate-bug-id: MRP-2787
Change-Id: I2177736564c3b8164f1ad3e4cc02dca1704a0e6e
Reviewed-on: http://review.whamcloud.com/18120
Reviewed-by: Fan Yong
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Vitaly Fertman
Reviewed-by: Oleg Drokin
---
 lustre/include/lustre_dlm.h    |  13 +++
 lustre/include/obd_support.h   |   1 +
 lustre/ldlm/ldlm_extent.c      | 106 ++++++++++++++++++++++++
 lustre/ldlm/ldlm_lock.c        |   7 ++
 lustre/ofd/ofd_dev.c           | 180 +++++++++++++++++------------------------
 lustre/osc/osc_request.c       |   1 +
 lustre/ptlrpc/service.c        |   9 ++-
 lustre/tests/recovery-small.sh |  25 ++++++
 8 files changed, 233 insertions(+), 109 deletions(-)

diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index 0afbad7..4594387 100644
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -1204,6 +1204,19 @@ int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
 /* ldlm_extent.c */
 __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
 
+struct ldlm_prolong_args {
+        struct obd_export       *lpa_export;
+        struct ldlm_res_id      lpa_resid;
+        struct ldlm_extent      lpa_extent;
+        enum ldlm_mode          lpa_mode;
+        int                     lpa_timeout;
+        int                     lpa_locks_cnt;
+        int                     lpa_blocks_cnt;
+};
+void ldlm_lock_prolong_one(struct ldlm_lock *lock,
+                           struct ldlm_prolong_args *arg);
+void ldlm_resource_prolong(struct ldlm_prolong_args *arg);
+
 struct ldlm_callback_suite {
         ldlm_completion_callback lcs_completion;
         ldlm_blocking_callback   lcs_blocking;
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 87dcb30..c1eedb5 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -392,6 +392,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSC_NO_GRANT            0x411
 #define OBD_FAIL_OSC_DELAY_SETTIME       0x412
 #define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413
+#define OBD_FAIL_OSC_DELAY_IO            0x414
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c
index 34bfa76..ace668a 100644
--- a/lustre/ldlm/ldlm_extent.c
+++ b/lustre/ldlm/ldlm_extent.c
@@ -632,6 +632,112 @@ destroylock:
 }
 
 /**
+ * This function refreshes the eviction timer for a cancelled lock.
+ * \param[in] lock      ldlm lock to refresh
+ * \param[in] arg       ldlm prolong arguments; the timeout, export, extent
+ *                      and counters are used
+ */
+void ldlm_lock_prolong_one(struct ldlm_lock *lock,
+                           struct ldlm_prolong_args *arg)
+{
+        int timeout;
+
+        if (arg->lpa_export != lock->l_export ||
+            lock->l_flags & LDLM_FL_DESTROYED)
+                /* ignore unrelated locks */
+                return;
+
+        arg->lpa_locks_cnt++;
+
+        if (!(lock->l_flags & LDLM_FL_AST_SENT))
+                /* ignore locks not being cancelled */
+                return;
+
+        /* We are in the middle of the process - BL AST is sent, CANCEL
+         * is ahead. Take half of BL AT + IO AT process time.
+         */
+        timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1);
+
+        LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout);
+
+        arg->lpa_blocks_cnt++;
+
+        /* OK, this is a possible lock the user holds doing I/O;
+         * let's refresh its eviction timer.
+         */
+        ldlm_refresh_waiting_lock(lock, timeout);
+}
+EXPORT_SYMBOL(ldlm_lock_prolong_one);
+
+static enum interval_iter ldlm_resource_prolong_cb(struct interval_node *n,
+                                                   void *data)
+{
+        struct ldlm_prolong_args *arg = data;
+        struct ldlm_interval *node = to_ldlm_interval(n);
+        struct ldlm_lock *lock;
+
+        ENTRY;
+
+        LASSERT(!list_empty(&node->li_group));
+
+        list_for_each_entry(lock, &node->li_group, l_sl_policy) {
+                ldlm_lock_prolong_one(lock, arg);
+        }
+
+        RETURN(INTERVAL_ITER_CONT);
+}
+
+/**
+ * Walk through the granted tree and prolong locks if they overlap the extent.
+ *
+ * \param[in] arg       prolong args
+ */
+void ldlm_resource_prolong(struct ldlm_prolong_args *arg)
+{
+        struct ldlm_interval_tree *tree;
+        struct ldlm_resource *res;
+        struct interval_node_extent ex = { .start = arg->lpa_extent.start,
+                                           .end = arg->lpa_extent.end };
+        int idx;
+
+        ENTRY;
+
+        res = ldlm_resource_get(arg->lpa_export->exp_obd->obd_namespace, NULL,
+                                &arg->lpa_resid, LDLM_EXTENT, 0);
+        if (IS_ERR(res)) {
+                CDEBUG(D_DLMTRACE, "Failed to get resource for resid "LPU64"/"
+                       LPU64"\n", arg->lpa_resid.name[0],
+                       arg->lpa_resid.name[1]);
+                RETURN_EXIT;
+        }
+
+        lock_res(res);
+        for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                tree = &res->lr_itree[idx];
+                if (tree->lit_root == NULL) /* empty tree, skipped */
+                        continue;
+
+                /* There is no way to check for the groupID, so all the
+                 * group locks are considered valid here, especially
+                 * because the client is supposed to check that it has
+                 * such a lock before sending an RPC.
+                 */
+                if (!(tree->lit_mode & arg->lpa_mode))
+                        continue;
+
+                interval_search(tree->lit_root, &ex,
+                                ldlm_resource_prolong_cb, arg);
+        }
+
+        unlock_res(res);
+        ldlm_resource_putref(res);
+
+        EXIT;
+}
+EXPORT_SYMBOL(ldlm_resource_prolong);
+
+
+/**
  * Discard all AST work items from list.
  *
  * If for whatever reason we do not want to send ASTs to conflicting locks
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c
index 10f8e8f..3e9cab4 100644
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -603,6 +603,13 @@ struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
         if (lock == NULL)
                 RETURN(NULL);
 
+        if (lock->l_export != NULL && lock->l_export->exp_failed) {
+                CDEBUG(D_INFO, "lock export failed: lock %p, exp %p\n",
+                       lock, lock->l_export);
+                LDLM_LOCK_PUT(lock);
+                RETURN(NULL);
+        }
+
         /* It's unlikely but possible that someone marked the lock as
          * destroyed after we did handle2object on it */
         if ((flags == 0) && !ldlm_is_destroyed(lock)) {
diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c
index 71ec9f7..05365a9 100644
--- a/lustre/ofd/ofd_dev.c
+++ b/lustre/ofd/ofd_dev.c
@@ -2266,59 +2266,14 @@ static int ofd_quotactl(struct tgt_session_info *tsi)
  *
  * \retval              amount of time to extend the timeout with
  */
-static inline int prolong_timeout(struct ptlrpc_request *req,
-                                  struct ldlm_lock *lock)
+static inline int prolong_timeout(struct ptlrpc_request *req)
 {
         struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
 
         if (AT_OFF)
                 return obd_timeout / 2;
 
-        /* We are in the middle of the process - BL AST is sent, CANCEL
-          is ahead. Take half of AT + IO process time. */
-        return at_est2timeout(at_get(&svcpt->scp_at_estimate)) +
-               (ldlm_bl_timeout(lock) >> 1);
-}
-
-/**
- * Prolong single lock timeout.
- *
- * This is supplemental function to the ofd_prolong_locks(). It prolongs
- * a single lock.
- *
- * \param[in] tsi       target session environment for this request
- * \param[in] lock      LDLM lock to prolong
- * \param[in] extent    related extent
- * \param[in] timeout   timeout value to add
- *
- * \retval              0 if lock is not suitable for prolongation
- * \retval              1 if lock was prolonged successfully
- */
-static int ofd_prolong_one_lock(struct tgt_session_info *tsi,
-                                struct ldlm_lock *lock,
-                                struct ldlm_extent *extent)
-{
-        int timeout = prolong_timeout(tgt_ses_req(tsi), lock);
-
-        if (lock->l_flags & LDLM_FL_DESTROYED) /* lock already cancelled */
-                return 0;
-
-        /* XXX: never try to grab resource lock here because we're inside
-         * exp_bl_list_lock; in ldlm_lockd.c to handle waiting list we take
-         * res lock and then exp_bl_list_lock. */
-
-        if (!(lock->l_flags & LDLM_FL_AST_SENT))
-                /* ignore locks not being cancelled */
-                return 0;
-
-        LDLM_DEBUG(lock, "refreshed for req x"LPU64" ext("LPU64"->"LPU64") "
-                   "to %ds.\n", tgt_ses_req(tsi)->rq_xid, extent->start,
-                   extent->end, timeout);
-
-        /* OK. this is a possible lock the user holds doing I/O
-         * let's refresh eviction timer for it */
-        ldlm_refresh_waiting_lock(lock, timeout);
-        return 1;
+        return at_est2timeout(at_get(&svcpt->scp_at_estimate));
 }
 
 /**
@@ -2341,25 +2296,26 @@ static int ofd_prolong_one_lock(struct tgt_session_info *tsi,
  * request may cover multiple locks.
  *
  * \param[in] tsi       target session environment for this request
- * \param[in] start     start of extent
- * \param[in] end       end of extent
+ * \param[in] data      struct of data to prolong locks
  *
- * \retval              number of prolonged locks
  */
-static int ofd_prolong_extent_locks(struct tgt_session_info *tsi,
-                                    __u64 start, __u64 end)
+static void ofd_prolong_extent_locks(struct tgt_session_info *tsi,
+                                     struct ldlm_prolong_args *data)
 {
-        struct obd_export      *exp = tsi->tsi_exp;
         struct obdo            *oa = &tsi->tsi_ost_body->oa;
-        struct ldlm_extent      extent = {
-                .start = start,
-                .end = end
-        };
         struct ldlm_lock       *lock;
-        int                     lock_count = 0;
 
         ENTRY;
 
+        data->lpa_timeout = prolong_timeout(tgt_ses_req(tsi));
+        data->lpa_export = tsi->tsi_exp;
+        data->lpa_resid = tsi->tsi_resid;
+
+        CDEBUG(D_RPCTRACE, "Prolong locks for req %p with x"LPU64
+               " ext("LPU64"->"LPU64")\n", tgt_ses_req(tsi),
+               tgt_ses_req(tsi)->rq_xid, data->lpa_extent.start,
+               data->lpa_extent.end);
+
         if (oa->o_valid & OBD_MD_FLHANDLE) {
                 /* mostly a request should be covered by only one lock, try
                  * fast path. */
@@ -2367,42 +2323,21 @@ static int ofd_prolong_extent_locks(struct tgt_session_info *tsi,
                 if (lock != NULL) {
                         /* Fast path to check if the lock covers the whole IO
                          * region exclusively. */
-                        if (lock->l_granted_mode == LCK_PW &&
-                            ldlm_extent_contain(&lock->l_policy_data.l_extent,
-                                                &extent)) {
+                        if (ldlm_extent_contain(&lock->l_policy_data.l_extent,
+                                                &data->lpa_extent)) {
                                 /* bingo */
-                                LASSERT(lock->l_export == exp);
-                                lock_count = ofd_prolong_one_lock(tsi, lock,
-                                                                  &extent);
+                                LASSERT(lock->l_export == data->lpa_export);
+                                ldlm_lock_prolong_one(lock, data);
                                 LDLM_LOCK_PUT(lock);
-                                RETURN(lock_count);
+                                RETURN_EXIT;
                         }
                         lock->l_last_used = cfs_time_current();
                         LDLM_LOCK_PUT(lock);
                 }
         }
 
-        spin_lock_bh(&exp->exp_bl_list_lock);
-        list_for_each_entry(lock, &exp->exp_bl_list, l_exp_list) {
-                LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
-                LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
-
-                /* ignore waiting locks, no more granted locks in the list */
-                if (lock->l_granted_mode != lock->l_req_mode)
-                        break;
-
-                if (!ldlm_res_eq(&tsi->tsi_resid, &lock->l_resource->lr_name))
-                        continue;
-
-                if (!ldlm_extent_overlap(&lock->l_policy_data.l_extent,
-                                         &extent))
-                        continue;
-
-                lock_count += ofd_prolong_one_lock(tsi, lock, &extent);
-        }
-        spin_unlock_bh(&exp->exp_bl_list_lock);
-
-        RETURN(lock_count);
+        ldlm_resource_prolong(data);
+        EXIT;
 }
 
 /**
@@ -2449,8 +2384,10 @@ static int ofd_rw_hpreq_lock_match(struct ptlrpc_request *req,
         if (!ostid_res_name_eq(&ioo->ioo_oid, &lock->l_resource->lr_name))
                 RETURN(0);
 
-        /* a bulk write can only hold a reference on a PW extent lock */
-        mode = LCK_PW;
+        /* a bulk write can only hold a reference on a PW extent lock
+         * or GROUP lock.
+         */
+        mode = LCK_PW | LCK_GROUP;
         if (opc == OST_READ)
                 /* whereas a bulk read can be protected by either a PR or PW
                  * extent lock */
@@ -2466,20 +2403,22 @@ static int ofd_rw_hpreq_lock_match(struct ptlrpc_request *req,
  * Implementation of ptlrpc_hpreq_ops::hpreq_lock_check for OFD RW requests.
  *
  * Check for whether the given PTLRPC request (\a req) is blocking
- * an LDLM lock cancel.
+ * an LDLM lock cancel. Also checks whether the request is covered by an LDLM
+ * lock.
  *
  * \param[in] req       the incoming request
  *
  * \retval              1 if \a req is blocking an LDLM lock cancel
  * \retval              0 if it is not
+ * \retval              -ESTALE if lock is not found
  */
 static int ofd_rw_hpreq_check(struct ptlrpc_request *req)
 {
         struct tgt_session_info *tsi;
         struct obd_ioobj        *ioo;
         struct niobuf_remote    *rnb;
-        __u64                    start, end;
-        int                      lock_count;
+        int                      opc;
+        struct ldlm_prolong_args pa = { 0 };
 
         ENTRY;
 
@@ -2491,6 +2430,9 @@ static int ofd_rw_hpreq_check(struct ptlrpc_request *req)
          * Use LASSERT below because malformed RPCs should have
          * been filtered out in tgt_hpreq_handler().
          */
+        opc = lustre_msg_get_opc(req->rq_reqmsg);
+        LASSERT(opc == OST_READ || opc == OST_WRITE);
+
         ioo = req_capsule_client_get(&req->rq_pill, &RMF_OBD_IOOBJ);
         LASSERT(ioo != NULL);
 
@@ -2498,21 +2440,28 @@ static int ofd_rw_hpreq_check(struct ptlrpc_request *req)
         LASSERT(rnb != NULL);
         LASSERT(!(rnb->rnb_flags & OBD_BRW_SRVLOCK));
 
-        start = rnb->rnb_offset;
+        pa.lpa_mode = LCK_PW | LCK_GROUP;
+        if (opc == OST_READ)
+                pa.lpa_mode |= LCK_PR;
+
+        pa.lpa_extent.start = rnb->rnb_offset;
         rnb += ioo->ioo_bufcnt - 1;
-        end = rnb->rnb_offset + rnb->rnb_len - 1;
+        pa.lpa_extent.end = rnb->rnb_offset + rnb->rnb_len - 1;
 
         DEBUG_REQ(D_RPCTRACE, req, "%s %s: refresh rw locks: "DFID
-                  " ("LPU64"->"LPU64")\n",
-                  tgt_name(tsi->tsi_tgt), current->comm,
-                  PFID(&tsi->tsi_fid), start, end);
+                  " ("LPU64"->"LPU64")\n", tgt_name(tsi->tsi_tgt),
+                  current->comm, PFID(&tsi->tsi_fid), pa.lpa_extent.start,
+                  pa.lpa_extent.end);
 
-        lock_count = ofd_prolong_extent_locks(tsi, start, end);
+        ofd_prolong_extent_locks(tsi, &pa);
 
         CDEBUG(D_DLMTRACE, "%s: refreshed %u locks timeout for req %p.\n",
-               tgt_name(tsi->tsi_tgt), lock_count, req);
+               tgt_name(tsi->tsi_tgt), pa.lpa_blocks_cnt, req);
 
-        RETURN(lock_count > 0);
+        if (pa.lpa_blocks_cnt > 0)
+                RETURN(1);
+
+        RETURN(pa.lpa_locks_cnt > 0 ? 0 : -ESTALE);
 }
 
 /**
@@ -2582,18 +2531,22 @@ static int ofd_punch_hpreq_lock_match(struct ptlrpc_request *req,
  * Implementation of ptlrpc_hpreq_ops::hpreq_lock_check for OST_PUNCH request.
  *
  * High-priority queue request check for whether the given punch request
- * (\a req) is blocking an LDLM lock cancel.
+ * (\a req) is blocking an LDLM lock cancel. Also checks whether the request is
+ * covered by an LDLM lock.
+ *
+ *
  * \param[in] req       the incoming request
  *
 * \retval              1 if \a req is blocking an LDLM lock cancel
 * \retval              0 if it is not
+ * \retval              -ESTALE if lock is not found
 */
 static int ofd_punch_hpreq_check(struct ptlrpc_request *req)
 {
         struct tgt_session_info *tsi;
         struct obdo             *oa;
-        int                      lock_count;
+        struct ldlm_prolong_args pa = { 0 };
 
         ENTRY;
 
@@ -2606,17 +2559,24 @@ static int ofd_punch_hpreq_check(struct ptlrpc_request *req)
         LASSERT(!(oa->o_valid & OBD_MD_FLFLAGS &&
                   oa->o_flags & OBD_FL_SRVLOCK));
 
+        pa.lpa_mode = LCK_PW | LCK_GROUP;
+        pa.lpa_extent.start = oa->o_size;
+        pa.lpa_extent.end = oa->o_blocks;
+
         CDEBUG(D_DLMTRACE,
                "%s: refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n",
                tgt_name(tsi->tsi_tgt), tsi->tsi_resid.name[0],
-               tsi->tsi_resid.name[1], oa->o_size, oa->o_blocks);
+               tsi->tsi_resid.name[1], pa.lpa_extent.start, pa.lpa_extent.end);
 
-        lock_count = ofd_prolong_extent_locks(tsi, oa->o_size, oa->o_blocks);
+        ofd_prolong_extent_locks(tsi, &pa);
 
         CDEBUG(D_DLMTRACE, "%s: refreshed %u locks timeout for req %p.\n",
-               tgt_name(tsi->tsi_tgt), lock_count, req);
+               tgt_name(tsi->tsi_tgt), pa.lpa_blocks_cnt, req);
+
+        if (pa.lpa_blocks_cnt > 0)
+                RETURN(1);
 
-        RETURN(lock_count > 0);
+        RETURN(pa.lpa_locks_cnt > 0 ? 0 : -ESTALE);
 }
 
 /**
@@ -2667,7 +2627,9 @@ static void ofd_hp_brw(struct tgt_session_info *tsi)
                 LASSERT(rnb != NULL); /* must exist after request preprocessing */
                 /* no high priority if server lock is needed */
-                if (rnb->rnb_flags & OBD_BRW_SRVLOCK)
+                if (rnb->rnb_flags & OBD_BRW_SRVLOCK ||
+                    (lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg)
+                     & MSG_REPLAY))
                         return;
         }
 
         tgt_ses_req(tsi)->rq_ops = &ofd_hpreq_rw;
@@ -2686,8 +2648,10 @@ static void ofd_hp_punch(struct tgt_session_info *tsi)
 {
         LASSERT(tsi->tsi_ost_body != NULL); /* must exists if we are here */
         /* no high-priority if server lock is needed */
-        if (tsi->tsi_ost_body->oa.o_valid & OBD_MD_FLFLAGS &&
-            tsi->tsi_ost_body->oa.o_flags & OBD_FL_SRVLOCK)
+        if ((tsi->tsi_ost_body->oa.o_valid & OBD_MD_FLFLAGS &&
+             tsi->tsi_ost_body->oa.o_flags & OBD_FL_SRVLOCK) ||
+            tgt_conn_flags(tsi) & OBD_CONNECT_MDS ||
+            lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY)
                 return;
         tgt_ses_req(tsi)->rq_ops = &ofd_hpreq_punch;
 }
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 5161287..4e9cf07 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -1904,6 +1904,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
         DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%uw in flight",
                   page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight);
 
+        OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, 4);
         ptlrpcd_add_req(req);
         rc = 0;
 
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 40077f0..e86c382 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1583,7 +1583,14 @@ static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
                  * list otherwise it may hit swab race at LU-1044. */
                 if (req->rq_ops->hpreq_check != NULL) {
                         rc = req->rq_ops->hpreq_check(req);
-                        LASSERT(rc <= 1); /* can only return error, 0, or 1 */
+                        if (rc == -ESTALE) {
+                                req->rq_status = rc;
+                                ptlrpc_error(req);
+                        }
+                        /** can only return error,
+                         * 0 for normal request,
+                         * or 1 for high priority request */
+                        LASSERT(rc <= 1);
                 }
         }
 
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 5d6a40d..0d2cfa7 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -2496,6 +2496,31 @@ test_132() {
 }
 run_test 132 "long punch"
 
+test_131() {
+	remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+	rm -f $DIR/$tfile
+	# get a lock on client so that export would reach the stale list
+	$SETSTRIPE -i 0 $DIR/$tfile || error "setstripe failed"
+	dd if=/dev/zero of=$DIR/$tfile count=1 || error "dd failed"
+
+	# another IO under the same lock
+	#define OBD_FAIL_OSC_DELAY_IO 0x414
+	$LCTL set_param fail_loc=0x80000414
+	dd if=/dev/zero of=$DIR/$tfile count=1 conv=notrunc oflag=dsync &
+	local pid=$!
+	sleep 1
+
+	#define OBD_FAIL_LDLM_BL_EVICT 0x31e
+	set_nodes_failloc "$(osts_nodes)" 0x8000031e
+	ost_evict_client
+	client_reconnect
+
+	wait $pid && error "dd succeeded"
+	return 0
+}
+run_test 131 "IO vs evict results in IO under stale lock"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
-- 
1.8.3.1
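
Note for readers (not part of the patch): the key behavioral change in hpreq_check() is the new return-value contract: 1 means the request is blocking a lock cancel (handle as high priority), 0 means it is covered by a granted lock but not blocking, and -ESTALE means no valid lock was found, so ptlrpc_server_hpreq_init() replies with the error. Below is a minimal, userspace-compilable sketch of that decision logic only. The toy_* names are invented for illustration and are not Lustre APIs; the real counters are filled by ofd_prolong_extent_locks()/ldlm_resource_prolong().

/* Illustrative sketch of the hpreq_check() return-value contract. */
#include <errno.h>
#include <stdio.h>

struct toy_prolong_args {
	int lpa_locks_cnt;	/* granted locks matching the IO extent */
	int lpa_blocks_cnt;	/* of those, locks with a blocking AST sent */
};

/* Mirrors the tail of ofd_rw_hpreq_check()/ofd_punch_hpreq_check():
 * 1 = blocking a lock cancel, 0 = covered but not blocking,
 * -ESTALE = no valid lock found for the IO. */
static int toy_hpreq_check(const struct toy_prolong_args *pa)
{
	if (pa->lpa_blocks_cnt > 0)
		return 1;
	return pa->lpa_locks_cnt > 0 ? 0 : -ESTALE;
}

int main(void)
{
	struct toy_prolong_args blocking = { .lpa_locks_cnt = 1, .lpa_blocks_cnt = 1 };
	struct toy_prolong_args covered  = { .lpa_locks_cnt = 1, .lpa_blocks_cnt = 0 };
	struct toy_prolong_args stale    = { .lpa_locks_cnt = 0, .lpa_blocks_cnt = 0 };

	printf("blocking: %d\n", toy_hpreq_check(&blocking)); /* 1 */
	printf("covered:  %d\n", toy_hpreq_check(&covered));  /* 0 */
	printf("stale:    %d\n", toy_hpreq_check(&stale));    /* -ESTALE, -116 on Linux */
	return 0;
}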