b=18618

author nathan <nathan>

Tue, 9 Jun 2009 20:43:41 +0000 (20:43 +0000)

committer nathan <nathan>

Tue, 9 Jun 2009 20:43:41 +0000 (20:43 +0000)
author nathan <nathan>
Tue, 9 Jun 2009 20:43:41 +0000 (20:43 +0000)
committer nathan <nathan>
Tue, 9 Jun 2009 20:43:41 +0000 (20:43 +0000)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index 29fa414..e44e759 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -15,6 +15,14 @@ tbd Sun Microsystems, Inc.
           more information, please refer to bugzilla 17630.
  
  Severity   : normal
+Frequency  : rare, if clients are evicted due to failure to return a lock
+Bugzilla   : 18618
+Description: don't increase ldlm timeout if previous client was evicted
+Details    : if a client doesn't respond to a BAST within the adaptive ldlm
+            enqueue timeout, don't adjust the adaptive estimate when the lock
+            is next granted.
+       
+Severity   : normal
  Bugzilla   : 18674
  Description: abort bulk too early if client is reconnected
  
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c

index 3c06d8f..95e37fb 100644 (file)
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -802,11 +802,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
                                            lock->l_last_activity);
  
-        if (total_enqueue_wait > obd_timeout)
-                /* non-fatal with AT - change to LDLM_DEBUG? */
-                LDLM_WARN(lock, "enqueue wait took %lus from %lu",
-                          total_enqueue_wait, lock->l_last_activity);
-
          lock_res_and_lock(lock);
          if (lock->l_resource->lr_lvb_len) {
                  size[DLM_REQ_REC_OFF] = lock->l_resource->lr_lvb_len;
@@ -846,8 +841,18 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
  
          /* Server-side enqueue wait time estimate, used in
              __ldlm_add_waiting_lock to set future enqueue timers */
-        at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
-               total_enqueue_wait);
+        if (total_enqueue_wait < ldlm_get_enq_timeout(lock))
+                at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+                       total_enqueue_wait);
+        else
+                /* bz18618. Don't add lock enqueue time we spend waiting for a
+                   previous callback to fail. Locks waiting legitimately will
+                   get extended by ldlm_refresh_waiting_lock regardless of the
+                   estimate, so it's okay to underestimate here. */
+                LDLM_DEBUG(lock, "lock completed after %lus; estimate was %ds. "
+                       "It is likely that a previous callback timed out.",
+                       total_enqueue_wait,
+                       at_get(&lock->l_resource->lr_namespace->ns_at_estimate));
  
          ptlrpc_req_set_repsize(req, 1, NULL);
  
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index cd85edd..3621d9d 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -110,7 +110,7 @@ int ldlm_get_enq_timeout(struct ldlm_lock *lock)
          /* Since these are non-updating timeouts, we should be conservative.
             It would be nice to have some kind of "early reply" mechanism for
             lock callbacks too... */
-        timeout = timeout + (timeout >> 1); /* 150% */
+        timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
          return max(timeout, ldlm_enqueue_min);
  }
  EXPORT_SYMBOL(ldlm_get_enq_timeout);
author	nathan <nathan>
	Tue, 9 Jun 2009 20:43:41 +0000 (20:43 +0000)
committer	nathan <nathan>
	Tue, 9 Jun 2009 20:43:41 +0000 (20:43 +0000)
lustre/ChangeLog		patch | blob | history
lustre/ldlm/ldlm_lockd.c		patch | blob | history
lustre/ldlm/ldlm_request.c		patch | blob | history