From 3209d206927d771d2ce7d71a1a75aecce4d0b356 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Mon, 10 Jul 2023 18:49:50 -0400 Subject: [PATCH] LU-17003 dne: remove REP-ACK support in DNE system DNE system doesn't need to support REP-ACK. In the old implementation, write locks are kept in PW|EX mode after transaction stop, and will be downgraded to TXN mode till REP-ACK, and then not released until transaction commit. While in the period between transaction stop and REP-ACK, any read lock request will be on hold till downgrade, with this change, this read lock request will succeed immediately. During this period, any write lock request may involve extra commit, since mdt_blocking_ast() does not know whether transaction has stopped, so it needs to trigger commit-on-sharing immediately, and also set 'sync' flag in the lock. If transaction is not stopped yet, later when it's stopped, it will trigger another commit-on-sharing since the 'sync' flag is set. With this change, mdt_blocking_ast() only needs to set 'sync' flag if its mode is PW|EX, and trigger commit-on-sharing once upon unlock. This refuces the number of transaction commits and may improve performance in some corner cases. Signed-off-by: Lai Siyao Change-Id: I159a0ad619afd10e97be3dc175a6b4ed77b31142 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51851 Reviewed-by: Andreas Dilger Reviewed-by: Mikhail Pershin Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- lustre/include/lustre_net.h | 4 +--- lustre/ldlm/ldlm_lib.c | 2 -- lustre/ldlm/ldlm_lock.c | 29 +++++++++++++++++++----- lustre/mdt/mdt_handler.c | 51 +++++++++++++++++------------------------- lustre/mdt/mdt_internal.h | 1 + lustre/mdt/mdt_recovery.c | 3 +-- lustre/ptlrpc/layout.c | 1 - lustre/ptlrpc/service.c | 54 +-------------------------------------------- 8 files changed, 47 insertions(+), 98 deletions(-) diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 71cd242..f1ec613 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -668,8 +668,6 @@ struct ptlrpc_reply_state { unsigned long rs_committed:1;/* the transaction was committed and the rs was dispatched by ptlrpc_commit_replies */ - unsigned long rs_convert_lock:1; /* need to convert saved - * locks to COS mode */ atomic_t rs_refcount; /* number of users */ /** Number of locks awaiting client ACK */ int rs_nlocks; @@ -2236,7 +2234,7 @@ struct ptlrpc_service_conf { * @{ */ void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, - int mode, bool no_ack, bool convert_lock); + int mode, bool no_ack); void ptlrpc_commit_replies(struct obd_export *exp); void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index ee248fe..4c0d616 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -815,8 +815,6 @@ int server_disconnect_export(struct obd_export *exp) list_del_init(&rs->rs_exp_list); spin_lock(&rs->rs_lock); - /* clear rs_convert_lock to make sure rs is handled and put */ - rs->rs_convert_lock = 0; ptlrpc_schedule_difficult_reply(rs); spin_unlock(&rs->rs_lock); diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b17fa40..8a8b4fe 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -2178,13 +2178,13 @@ ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) bld.bl_same_client = lock->l_client_cookie == lock->l_blocking_lock->l_client_cookie; /* if two locks are initiated from the same MDT, transactions are - * independent, or the request lock mode is CR|PR|CW, no need to trigger + * independent, or the request lock mode isn't EX|PW, no need to trigger * CoS because current lock will be downgraded to TXN mode soon, then * the blocking lock can be granted. */ if (lock->l_blocking_lock->l_policy_data.l_inodebits.li_initiator_id == lock->l_policy_data.l_inodebits.li_initiator_id || - lock->l_blocking_lock->l_req_mode & (LCK_CR | LCK_PR | LCK_CW)) + !(lock->l_blocking_lock->l_req_mode & (LCK_EX | LCK_PW))) bld.bl_txn_dependent = false; else bld.bl_txn_dependent = true; @@ -2898,8 +2898,26 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, break; case LDLM_IBITS: - libcfs_debug_msg(msgdata, - "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d initiator: MDT%d\n", + if (!lock->l_remote_handle.cookie) + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s flags: %#llx pid: %u initiator: MDT%d\n", + &vaf, + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + refcount_read(&lock->l_handle.h_ref), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + PLDLMRES(resource), + lock->l_policy_data.l_inodebits.bits, + lock->l_policy_data.l_inodebits.try_bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, lock->l_pid, + lock->l_policy_data.l_inodebits.li_initiator_id); + else + libcfs_debug_msg(msgdata, + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", &vaf, ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, @@ -2917,8 +2935,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, lock->l_remote_handle.cookie, exp ? refcount_read(&exp->exp_handle.h_ref) : -99, lock->l_pid, lock->l_callback_timestamp, - lock->l_lvb_type, - lock->l_policy_data.l_inodebits.li_initiator_id); + lock->l_lvb_type); break; default: diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index e0841f9..f059339 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -3600,34 +3600,17 @@ int mdt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, * The 'data' parameter is l_ast_data in the first case and * callback arguments in the second one. Distinguish them by that. */ - if (!data || data == lock->l_ast_data || !arg->bl_desc) - goto skip_cos_checks; - - if (lock->l_req_mode & (LCK_PW | LCK_EX)) { - if (mdt_cos_is_enabled(mdt) && - !arg->bl_desc->bl_same_client) { - mdt_set_lock_sync(lock); - } else if (mdt_slc_is_enabled(mdt) && - arg->bl_desc->bl_txn_dependent) { - mdt_set_lock_sync(lock); - /* we may do extra commit here, because there is a small - * window to miss a commit: - * 1. lock was unlocked (saved), but not downgraded to - * TXN mode yet (REP-ACK not received). - * 2. a conflict lock enqueued and we come herej if we - * don't trigger commit now, the enqueued lock will wait - * untill system periodic commit. - * - * Fortunately this window is quite small, not to say - * distributed operation is rare too. - */ + if (data && data != lock->l_ast_data && arg->bl_desc) { + if (lock->l_req_mode & (LCK_COS | LCK_TXN)) commit_async = true; - } - } else if (lock->l_req_mode == LCK_COS || lock->l_req_mode == LCK_TXN) { - commit_async = true; + else if ((lock->l_req_mode & (LCK_PW | LCK_EX)) && + ((mdt_cos_is_enabled(mdt) && + !arg->bl_desc->bl_same_client) || + (mdt_slc_is_enabled(mdt) && + arg->bl_desc->bl_txn_dependent))) + mdt_set_lock_sync(lock); } -skip_cos_checks: rc = ldlm_blocking_ast_nocheck(lock); if (commit_async) { @@ -4163,8 +4146,7 @@ static void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h, struct mdt_device *mdt = info->mti_mdt; struct ldlm_lock *lock = ldlm_handle2lock(h); struct ptlrpc_request *req = mdt_info_req(info); - bool cos = mdt_cos_is_enabled(mdt); - bool convert_lock = !cos && mdt_slc_is_enabled(mdt); + bool no_ack = false; LASSERTF(lock != NULL, "no lock for cookie %#llx\n", h->cookie); @@ -4175,15 +4157,22 @@ static void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h, LDLM_DEBUG(lock, "save lock request %p reply " "state %p transno %lld\n", req, req->rq_reply_state, req->rq_transno); - if (cos) { - ldlm_lock_mode_downgrade(lock, LCK_COS); + if (mdt_cos_is_enabled(mdt)) { mode = LCK_COS; + no_ack = true; + ldlm_lock_mode_downgrade(lock, mode); + } else if (mdt_slc_is_enabled(mdt)) { + no_ack = true; + if (mode != LCK_TXN) { + mode = LCK_TXN; + ldlm_lock_mode_downgrade(lock, + mode); + } } if (req->rq_export->exp_disconnected) mdt_fid_unlock(h, mode); else - ptlrpc_save_lock(req, h, mode, cos, - convert_lock); + ptlrpc_save_lock(req, h, mode, no_ack); } else { mdt_fid_unlock(h, mode); } diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 58995ee..71fef0d 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -1279,6 +1279,7 @@ static inline void mdt_fid_unlock(struct lustre_handle *lh, enum ldlm_mode mode) ldlm_lock_decref(lh, mode); } +/* this is identical to say this is a DNE system */ static inline bool mdt_slc_is_enabled(struct mdt_device *mdt) { return mdt->mdt_lut.lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING; diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index 760e029..2c1e2ac 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -103,8 +103,7 @@ static void mdt_steal_ack_locks(struct ptlrpc_request *req) spin_lock(&rs->rs_lock); for (i = 0; i < rs->rs_nlocks; i++) ptlrpc_save_lock(req, &rs->rs_locks[i], - rs->rs_modes[i], rs->rs_no_ack, - rs->rs_convert_lock); + rs->rs_modes[i], rs->rs_no_ack); rs->rs_nlocks = 0; DEBUG_REQ(D_HA, req, "stole locks for"); diff --git a/lustre/ptlrpc/layout.c b/lustre/ptlrpc/layout.c index d471fc3..506317d 100644 --- a/lustre/ptlrpc/layout.c +++ b/lustre/ptlrpc/layout.c @@ -2849,7 +2849,6 @@ int req_capsule_server_grow(struct req_capsule *pill, nrs->rs_difficult = 1; nrs->rs_no_ack = rs->rs_no_ack; - nrs->rs_convert_lock = rs->rs_convert_lock; for (i = 0; i < rs->rs_nlocks; i++) { nrs->rs_locks[i] = rs->rs_locks[i]; nrs->rs_modes[i] = rs->rs_modes[i]; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index fac60b5..cc33cb9 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -184,7 +184,7 @@ static int ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) * Puts a lock and its mode into reply state assotiated to request reply. */ void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, - int mode, bool no_ack, bool convert_lock) + int mode, bool no_ack) { struct ptlrpc_reply_state *rs = req->rq_reply_state; int idx; @@ -197,7 +197,6 @@ void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, rs->rs_modes[idx] = mode; rs->rs_difficult = 1; rs->rs_no_ack = no_ack; - rs->rs_convert_lock = convert_lock; } EXPORT_SYMBOL(ptlrpc_save_lock); @@ -2437,56 +2436,6 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) * rs_lock, which we do right next. */ if (!rs->rs_committed) { - /* - * if rs was commited, no need to convert locks, don't check - * rs_committed here because rs may never be added into - * exp_uncommitted_replies and this flag never be set, see - * target_send_reply() - */ - if (rs->rs_convert_lock && - rs->rs_transno > exp->exp_last_committed) { - struct ldlm_lock *lock; - struct ldlm_lock *ack_locks[RS_MAX_LOCKS] = { NULL }; - - spin_lock(&rs->rs_lock); - if (rs->rs_convert_lock && - rs->rs_transno > exp->exp_last_committed) { - nlocks = rs->rs_nlocks; - while (nlocks-- > 0) { - /* - * NB don't assume rs is always handled - * by the same service thread (see - * ptlrpc_hr_select, so REP-ACK hr may - * race with trans commit, while the - * latter will release locks, get locks - * here early to downgrade to TXN mode - * safely. - */ - lock = ldlm_handle2lock( - &rs->rs_locks[nlocks]); - LASSERT(lock); - ack_locks[nlocks] = lock; - rs->rs_modes[nlocks] = LCK_TXN; - } - nlocks = rs->rs_nlocks; - rs->rs_convert_lock = 0; - /* - * clear rs_scheduled so that commit callback - * can schedule again - */ - rs->rs_scheduled = 0; - spin_unlock(&rs->rs_lock); - - while (nlocks-- > 0) { - lock = ack_locks[nlocks]; - ldlm_lock_mode_downgrade(lock, LCK_TXN); - LDLM_LOCK_PUT(lock); - } - RETURN(0); - } - spin_unlock(&rs->rs_lock); - } - spin_lock(&exp->exp_uncommitted_replies_lock); list_del_init(&rs->rs_obd_list); spin_unlock(&exp->exp_uncommitted_replies_lock); @@ -2536,7 +2485,6 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) } rs->rs_scheduled = 0; - rs->rs_convert_lock = 0; if (rs->rs_unlinked) { /* Off the net */ -- 1.8.3.1