Whamcloud - gitweb
branch: HEAD
[fs/lustre-release.git] / lustre / ldlm / ldlm_request.c
index f2049ec..4e68bee 100644 (file)
 
 #include "ldlm_internal.h"
 
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+                "lock enqueue timeout minimum");
+
 static void interrupted_completion_wait(void *data)
 {
 }
@@ -65,7 +69,8 @@ int ldlm_expired_completion_wait(void *data)
                           CFS_DURATION_T"s ago); not entering recovery in "
                            "server code, just going back to sleep",
                           lock->l_enqueued_time.tv_sec,
-                           cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+                           cfs_time_current_sec() -
+                           lock->l_enqueued_time.tv_sec);
                 if (cfs_time_after(cfs_time_current(), next_dump)) {
                         last_dump = next_dump;
                         next_dump = cfs_time_shift(300);
@@ -89,6 +94,20 @@ int ldlm_expired_completion_wait(void *data)
         RETURN(0);
 }
 
+/* We use the same basis for both server side and client side functions
+   from a single node. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+        int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+        if (AT_OFF)
+                return obd_timeout / 2;
+        /* Since these are non-updating timeouts, we should be conservative.
+           It would be nice to have some kind of "early reply" mechanism for
+           lock callbacks too... */
+        timeout = timeout + (timeout >> 1); /* 150% */
+        return max(timeout, ldlm_enqueue_min);
+}
+
 static int is_granted_or_cancelled(struct ldlm_lock *lock)
 {
         int ret = 0;
@@ -110,6 +129,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         struct obd_device *obd;
         struct obd_import *imp = NULL;
         struct l_wait_info lwi;
+        __u32 timeout;
         int rc = 0;
         ENTRY;
 
@@ -134,8 +154,14 @@ noreproc:
         obd = class_exp2obd(lock->l_conn_export);
 
         /* if this is a local lock, then there is no import */
-        if (obd != NULL)
+        if (obd != NULL) {
                 imp = obd->u.cli.cl_import;
+        }
+
+        /* Wait a long time for enqueue - server may have to callback a
+           lock from another client.  Server will evict the other client if it
+           doesn't respond reasonably, and then give us the lock. */
+        timeout = ldlm_get_enq_timeout(lock) * 2;
 
         lwd.lwd_lock = lock;
 
@@ -143,7 +169,7 @@ noreproc:
                 LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
                 lwi = LWI_INTR(interrupted_completion_wait, &lwd);
         } else {
-                lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+                lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
                                        ldlm_expired_completion_wait,
                                        interrupted_completion_wait, &lwd);
         }
@@ -168,7 +194,13 @@ noreproc:
                 RETURN(rc);
         }
 
-        LDLM_DEBUG(lock, "client-side enqueue waking up: granted");
+        LDLM_DEBUG(lock, "client-side enqueue waking up: granted after %lds",
+                   cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
+        /* Update our time estimate */
+        at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+               cfs_time_current_sec() - lock->l_enqueued_time.tv_sec);
+
         RETURN(0);
 }
 
@@ -921,6 +953,8 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
         LASSERT(exp != NULL);
         LASSERT(count > 0);
 
+        OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, obd_fail_val);
+
         if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
                 RETURN(count);
 
@@ -955,9 +989,9 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
                 req->rq_no_resend = 1;
                 req->rq_no_delay = 1;
 
-                /* XXX FIXME bug 249 */
                 req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
                 req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+                ptlrpc_at_set_req_timeout(req);
 
                 ldlm_cancel_pack(req, cancels, count);