Whamcloud - gitweb
LU-14183 ldlm: wrong ldlm_add_waiting_lock usage
author Vitaly Fertman <c17818@cray.com>
Tue, 4 Oct 2022 17:24:31 +0000 (10:24 -0700)
committer Andreas Dilger <adilger@whamcloud.com>
Tue, 11 Oct 2022 07:47:48 +0000 (07:47 +0000)
Originally, exp_bl_lock_at accounted for the period from when the BLAST
was sent until the cancel RPC arrived at the server. LU-6032 started to
update l_blast_sent for expired locks which are still busy, i.e. locks
that are prolonged when the timeout expires. In fact, it is a good idea
to cover not the whole period but only the time until any involved RPC
arrives: this avoids excessively large lock callback timeouts, and the
IO which prolongs the lock is also able to restart the AT cycle by
updating l_blast_sent.

Unfortunately, that change seems to have been made accidentally, as the
main prolong code was not adjusted accordingly.

Lustre-change: https://review.whamcloud.com/40868
Lustre-commit: af07c9a79e263f940fea06a911803097b57b55f4

Fixes: 292aa42e08 ("LU-6032 ldlm: don't disable softirq for exp_rpc_lock")
HPE-bug-id: LUS-9278
Signed-off-by: Vitaly Fertman <c17818@cray.com>
Change-Id: Idc598508fc13aa33ac9fce56f13310ca6fc819d4
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/48761
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_lockd.c

index 56a8c73..b350708 100644 (file)
@@ -657,8 +657,6 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock,
         */
        timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1);
 
-       LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout);
-
        arg->lpa_blocks_cnt++;
 
        /* OK. this is a possible lock the user holds doing I/O
index 212a3a0..52c69e0 100644 (file)
@@ -257,7 +257,7 @@ static int expired_lock_main(void *arg)
 
                                LDLM_ERROR(lock,
                                           "lock callback timer expired after %llds: evicting client at %s ",
-                                          ktime_get_real_seconds() -
+                                          ktime_get_seconds() -
                                           lock->l_blast_sent,
                                           obd_export_nid2str(export));
                                ldlm_lock_to_ns(lock)->ns_timeouts++;
@@ -379,10 +379,10 @@ static void waiting_locks_callback(TIMER_DATA_TYPE unused)
 static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t delay)
 {
        unsigned long timeout_jiffies = jiffies;
-       time64_t now = ktime_get_seconds();
        time64_t deadline;
        timeout_t timeout;
 
+       lock->l_blast_sent = ktime_get_seconds();
        if (!list_empty(&lock->l_pending_chain))
                return 0;
 
@@ -390,11 +390,12 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t delay)
            OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
                delay = 1;
 
-       deadline = now + delay;
+       deadline = lock->l_blast_sent + delay;
        if (likely(deadline > lock->l_callback_timestamp))
                lock->l_callback_timestamp = deadline;
 
-       timeout = clamp_t(timeout_t, lock->l_callback_timestamp - now,
+       timeout = clamp_t(timeout_t,
+                         lock->l_callback_timestamp - lock->l_blast_sent,
                          0, delay);
        timeout_jiffies += cfs_time_seconds(timeout);
 
@@ -470,7 +471,6 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock, timeout_t timeout)
        }
 
        ldlm_set_waited(lock);
-       lock->l_blast_sent = ktime_get_real_seconds();
        ret = __ldlm_add_waiting_lock(lock, timeout);
        if (ret) {
                /*
@@ -598,7 +598,7 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout)
        __ldlm_add_waiting_lock(lock, timeout);
        spin_unlock_bh(&waiting_locks_spinlock);
 
-       LDLM_DEBUG(lock, "refreshed");
+       LDLM_DEBUG(lock, "refreshed to %ds", timeout);
        return 1;
 }
 EXPORT_SYMBOL(ldlm_refresh_waiting_lock);
@@ -1754,8 +1754,8 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
                    lock->l_blast_sent != 0) {
                        timeout_t delay = 0;
 
-                       if (ktime_get_real_seconds() > lock->l_blast_sent)
-                               delay = ktime_get_real_seconds() -
+                       if (ktime_get_seconds() > lock->l_blast_sent)
+                               delay = ktime_get_seconds() -
                                        lock->l_blast_sent;
                        LDLM_DEBUG(lock,
                                   "server cancels blocked lock after %ds",