branch: HEAD

[fs/lustre-release.git] / lustre / ldlm / ldlm_request.c
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index f2049ec..4e68bee 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -34,6 +34,10 @@
  
  #include "ldlm_internal.h"
  
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+                "lock enqueue timeout minimum");
+
  static void interrupted_completion_wait(void *data)
  {
  }
@@ -65,7 +69,8 @@ int ldlm_expired_completion_wait(void *data)
                            CFS_DURATION_T"s ago); not entering recovery in "
                             "server code, just going back to sleep",
                            lock->l_enqueued_time.tv_sec,
-                           cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+                           cfs_time_current_sec() -
+                           lock->l_enqueued_time.tv_sec);
                  if (cfs_time_after(cfs_time_current(), next_dump)) {
                          last_dump = next_dump;
                          next_dump = cfs_time_shift(300);
@@ -89,6 +94,20 @@ int ldlm_expired_completion_wait(void *data)
          RETURN(0);
  }
  
+/* We use the same basis for both server side and client side functions
+   from a single node. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+        int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+        if (AT_OFF)
+                return obd_timeout / 2;
+        /* Since these are non-updating timeouts, we should be conservative.
+           It would be nice to have some kind of "early reply" mechanism for
+           lock callbacks too... */
+        timeout = timeout + (timeout >> 1); /* 150% */
+        return max(timeout, ldlm_enqueue_min);
+}
+
  static int is_granted_or_cancelled(struct ldlm_lock *lock)
  {
          int ret = 0;
@@ -110,6 +129,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
          struct obd_device *obd;
          struct obd_import *imp = NULL;
          struct l_wait_info lwi;
+        __u32 timeout;
          int rc = 0;
          ENTRY;
  
@@ -134,8 +154,14 @@ noreproc:
          obd = class_exp2obd(lock->l_conn_export);
  
          /* if this is a local lock, then there is no import */
-        if (obd != NULL)
+        if (obd != NULL) {
                  imp = obd->u.cli.cl_import;
+        }
+
+        /* Wait a long time for enqueue - server may have to callback a
+           lock from another client.  Server will evict the other client if it
+           doesn't respond reasonably, and then give us the lock. */
+        timeout = ldlm_get_enq_timeout(lock) * 2;
  
          lwd.lwd_lock = lock;
  
@@ -143,7 +169,7 @@ noreproc:
                  LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
                  lwi = LWI_INTR(interrupted_completion_wait, &lwd);
          } else {
-                lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+                lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
                                         ldlm_expired_completion_wait,
                                         interrupted_completion_wait, &lwd);
          }
@@ -168,7 +194,13 @@ noreproc:
                  RETURN(rc);
          }
  
-        LDLM_DEBUG(lock, "client-side enqueue waking up: granted");
+        LDLM_DEBUG(lock, "client-side enqueue waking up: granted after %lds",
+                   cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
+        /* Update our time estimate */
+        at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+               cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
          RETURN(0);
  }
  
@@ -921,6 +953,8 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
          LASSERT(exp != NULL);
          LASSERT(count > 0);
  
+        OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, obd_fail_val);
+
          if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
                  RETURN(count);
  
@@ -955,9 +989,9 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
                  req->rq_no_resend = 1;
                  req->rq_no_delay = 1;
  
-                /* XXX FIXME bug 249 */
                  req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
                  req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+                ptlrpc_at_set_req_timeout(req);
  
                  ldlm_cancel_pack(req, cancels, count);