From af07c9a79e263f940fea06a911803097b57b55f4 Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Fri, 4 Dec 2020 20:22:55 +0300 Subject: [PATCH] LU-14183 ldlm: wrong ldlm_add_waiting_lock usage exp_bl_lock_at originally accounted for the period from when the BLAST was sent until the cancel RPC arrived at the server. LU-6032 started to update l_blast_sent for expired locks which are still busy - prolonged locks when the timeout expired. In fact, it is a good idea to cover not the whole period but only the time until any involved RPC arrives - it avoids excessively large lock callback timeouts - and the IO which does the lock prolong is also able to re-start the AT cycle by updating the l_blast_sent. Unfortunately, the change seems to have been made accidentally, as the main prolong code was not adjusted accordingly. Fixes: 292aa42e08 ("LU-6032 ldlm: don't disable softirq for exp_rpc_lock") HPE-bug-id: LUS-9278 Signed-off-by: Vitaly Fertman Change-Id: Idc598508fc13aa33ac9fce56f13310ca6fc819d4 Tested-by: Jenkins Build User Reviewed-by: Andriy Skulysh Reviewed-by: Alexander Boyko Reviewed-on: https://review.whamcloud.com/40868 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexander Boyko Reviewed-by: Andriy Skulysh Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_extent.c | 2 -- lustre/ldlm/ldlm_lockd.c | 16 ++++++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index d557f49..37aa87e 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -657,8 +657,6 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock, */ timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1); - LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout); - arg->lpa_blocks_cnt++; /* OK.
this is a possible lock the user holds doing I/O diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index cbf9479..52930be 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -255,7 +255,7 @@ static int expired_lock_main(void *arg) LDLM_ERROR(lock, "lock callback timer expired after %llds: evicting client at %s ", - ktime_get_real_seconds() - + ktime_get_seconds() - lock->l_blast_sent, obd_export_nid2str(export)); ldlm_lock_to_ns(lock)->ns_timeouts++; @@ -377,10 +377,10 @@ static void waiting_locks_callback(TIMER_DATA_TYPE unused) static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t delay) { unsigned long timeout_jiffies = jiffies; - time64_t now = ktime_get_seconds(); time64_t deadline; timeout_t timeout; + lock->l_blast_sent = ktime_get_seconds(); if (!list_empty(&lock->l_pending_chain)) return 0; @@ -388,11 +388,12 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t delay) OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) delay = 1; - deadline = now + delay; + deadline = lock->l_blast_sent + delay; if (likely(deadline > lock->l_callback_timestamp)) lock->l_callback_timestamp = deadline; - timeout = clamp_t(timeout_t, lock->l_callback_timestamp - now, + timeout = clamp_t(timeout_t, + lock->l_callback_timestamp - lock->l_blast_sent, 0, delay); timeout_jiffies += cfs_time_seconds(timeout); @@ -468,7 +469,6 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) } ldlm_set_waited(lock); - lock->l_blast_sent = ktime_get_real_seconds(); ret = __ldlm_add_waiting_lock(lock, timeout); if (ret) { /* @@ -596,7 +596,7 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout) __ldlm_add_waiting_lock(lock, timeout); spin_unlock_bh(&waiting_locks_spinlock); - LDLM_DEBUG(lock, "refreshed"); + LDLM_DEBUG(lock, "refreshed to %ds", timeout); return 1; } EXPORT_SYMBOL(ldlm_refresh_waiting_lock); @@ -1752,8 +1752,8 @@ int ldlm_request_cancel(struct ptlrpc_request *req, 
lock->l_blast_sent != 0) { timeout_t delay = 0; - if (ktime_get_real_seconds() > lock->l_blast_sent) - delay = ktime_get_real_seconds() - + if (ktime_get_seconds() > lock->l_blast_sent) + delay = ktime_get_seconds() - lock->l_blast_sent; LDLM_DEBUG(lock, "server cancels blocked lock after %ds", -- 1.8.3.1