Whamcloud - gitweb
Add an ldlm_timeout /proc tuneable during debugging of AST timeouts.
author: adilger <adilger>
Thu, 27 May 2004 13:48:30 +0000 (13:48 +0000)
committer: adilger <adilger>
Thu, 27 May 2004 13:48:30 +0000 (13:48 +0000)
If ldlm_bl_to_thread() fails, call ldlm_handle_bl_callback() directly.
b=3267, b=3468

17 files changed:
lustre/configure.in
lustre/include/linux/obd_support.h
lustre/ldlm/l_lock.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/llite/super.c
lustre/llite/super25.c
lustre/lvfs/fsfilt_ext3.c
lustre/mds/handler.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/sysctl.c
lustre/osc/osc_request.c
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/service.c

index 4491587..5068a99 100644 (file)
@@ -5,7 +5,7 @@
 
 AC_INIT
 AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE(lustre, 1.2.1.14)
+AM_INIT_AUTOMAKE(lustre, 1.2.2)
 # AM_MAINTAINER_MODE
 
 # Four main targets: lustre kernel modules, utilities, tests, and liblustre
index 9d7957e..94ea609 100644 (file)
@@ -37,6 +37,7 @@ extern atomic_t obd_memory;
 extern int obd_memmax;
 extern unsigned int obd_fail_loc;
 extern unsigned int obd_timeout;
+extern unsigned int ldlm_timeout;
 extern char obd_lustre_upcall[128];
 extern unsigned int obd_sync_filter;
 extern wait_queue_head_t obd_race_waitq;
index 69d3f14..e07c25e 100644 (file)
@@ -85,8 +85,9 @@ void l_lock(struct lustre_lock *lock)
 
 void l_unlock(struct lustre_lock *lock)
 {
-        LASSERT(lock->l_owner == current);
-        LASSERT(lock->l_depth >= 0);
+        LASSERTF(lock->l_owner == current, "lock %p, current %p\n",
+                 lock->l_owner, current);
+        LASSERTF(lock->l_depth >= 0, "depth %d\n", lock->l_depth);
 
         spin_lock(&lock->l_spin);
         if (--lock->l_depth < 0) {
index ec55510..d4f2e8f 100644 (file)
@@ -478,13 +478,9 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
 
                 LDLM_LOCK_GET(lock); /* dropped by bl thread */
                 ldlm_lock_remove_from_lru(lock);
-#ifdef __KERNEL__
-                ldlm_bl_to_thread(ns, NULL, lock);
-                l_unlock(&ns->ns_lock);
-#else
                 l_unlock(&ns->ns_lock);
-                ldlm_handle_bl_callback(ns, NULL, lock);
-#endif
+                if (ldlm_bl_to_thread(ns, NULL, lock) != 0)
+                        ldlm_handle_bl_callback(ns, NULL, lock);
         } else if (ns->ns_client == LDLM_NAMESPACE_CLIENT &&
                    !lock->l_readers && !lock->l_writers) {
                 /* If this is a client-side namespace and this was the last
index a9aaf11..e87096c 100644 (file)
@@ -46,7 +46,7 @@ extern struct lustre_lock ldlm_handle_lock;
 extern struct list_head ldlm_namespace_list;
 
 static DECLARE_MUTEX(ldlm_ref_sem);
-static int ldlm_refcount = 0;
+static int ldlm_refcount;
 
 /* LDLM state */
 
@@ -374,7 +374,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                 /* this blocking AST will be communicated as part of the
                  * completion AST instead */
                 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
-                LDLM_DEBUG(lock, "lock not granted, not sending blocking AST");                 RETURN(0);
+                LDLM_DEBUG(lock, "lock not granted, not sending blocking AST");
+                RETURN(0);
         }
 
         if (lock->l_destroyed) {
@@ -412,7 +413,7 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 
         req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
+        req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
         rc = ptlrpc_queue_wait(req);
         if (rc != 0)
                 rc = ldlm_handle_ast_error(lock, req, rc, "blocking");
@@ -474,7 +475,7 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         req->rq_replen = lustre_msg_size(0, NULL);
 
         req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
+        req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
 
         /* We only send real blocking ASTs after the lock is granted */
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
@@ -517,7 +518,7 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data)
         req->rq_replen = lustre_msg_size(1, &size);
 
         req->rq_send_state = LUSTRE_IMP_FULL;
-        req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
+        req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
 
         rc = ptlrpc_queue_wait(req);
         if (rc == -ELDLM_NO_LOCK_DATA)
@@ -889,16 +890,16 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
                 ptlrpc_error(req);
         }
 
+        l_unlock(&ns->ns_lock);
         if (lock->l_granted_mode == LCK_PW &&
             !lock->l_readers && !lock->l_writers &&
             time_after(jiffies, lock->l_last_used + 10 * HZ)) {
-                l_unlock(&ns->ns_lock);
-                ldlm_handle_bl_callback(ns, NULL, lock);
+                if (ldlm_bl_to_thread(ns, NULL, lock))
+                        ldlm_handle_bl_callback(ns, NULL, lock);
                 EXIT;
                 return;
         }
 
-        l_unlock(&ns->ns_lock);
         LDLM_LOCK_PUT(lock);
         EXIT;
 }
@@ -935,11 +936,11 @@ int ldlm_bl_to_thread(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
         list_add_tail(&blwi->blwi_entry, &blp->blp_list);
         wake_up(&blp->blp_waitq);
         spin_unlock(&blp->blp_lock);
-#else
-        LBUG();
-#endif
 
         RETURN(0);
+#else
+        RETURN(-ENOSYS);
+#endif
 }
 
 static int ldlm_callback_handler(struct ptlrpc_request *req)
@@ -1056,14 +1057,9 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
         switch (req->rq_reqmsg->opc) {
         case LDLM_BL_CALLBACK:
                 CDEBUG(D_INODE, "blocking ast\n");
-#ifdef __KERNEL__
-                rc = ldlm_bl_to_thread(ns, &dlm_req->lock_desc, lock);
-                ldlm_callback_reply(req, rc);
-#else
-                rc = 0;
-                ldlm_callback_reply(req, rc);
-                ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
-#endif
+                ldlm_callback_reply(req, 0);
+                if (ldlm_bl_to_thread(ns, &dlm_req->lock_desc, lock))
+                        ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
                 break;
         case LDLM_CP_CALLBACK:
                 CDEBUG(D_INODE, "completion ast\n");
@@ -1416,10 +1412,10 @@ void __exit ldlm_exit(void)
 {
         if ( ldlm_refcount )
                 CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
-        if (kmem_cache_destroy(ldlm_resource_slab) != 0)
-                CERROR("couldn't free ldlm resource slab\n");
-        if (kmem_cache_destroy(ldlm_lock_slab) != 0)
-                CERROR("couldn't free ldlm lock slab\n");
+        LASSERTF(kmem_cache_destroy(ldlm_resource_slab) == 0,
+                 "couldn't free ldlm resource slab\n");
+        LASSERTF(kmem_cache_destroy(ldlm_lock_slab) == 0,
+                 "couldn't free ldlm lock slab\n");
 }
 
 /* ldlm_flock.c */
index 9cdad73..d6e5359 100644 (file)
@@ -618,9 +618,7 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, ldlm_sync_t sync)
 
                 LDLM_LOCK_GET(lock); /* dropped by bl thread */
                 ldlm_lock_remove_from_lru(lock);
-                if (sync == LDLM_ASYNC)
-                        ldlm_bl_to_thread(ns, NULL, lock);
-                else
+                if (sync != LDLM_ASYNC || ldlm_bl_to_thread(ns, NULL, lock))
                         list_add(&lock->l_lru, &cblist);
 
                 if (--count == 0)
index 133407a..45fe591 100644 (file)
@@ -114,7 +114,8 @@ static void __exit exit_lustre_lite(void)
         unregister_filesystem(&lustre_lite_fs_type);
         unregister_filesystem(&lustre_fs_type);
 
-        kmem_cache_destroy(ll_file_data_slab);
+        LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0,
+                 "couldn't destroy ll_file_data slab\n");
 
         if (proc_lustre_fs_root) {
                 lprocfs_remove(proc_lustre_fs_root);
index 1f5425e..8df4dea 100644 (file)
@@ -93,8 +93,8 @@ int ll_init_inodecache(void)
 
 void ll_destroy_inodecache(void)
 {
-        if (kmem_cache_destroy(ll_inode_cachep))
-                CERROR("ll_inode_cache: not all structures were freed\n");
+        LASSERTF(kmem_cache_destroy(ll_inode_cachep) == 0,
+                 "ll_inode_cache: not all structures were freed\n");
 }
 
 /* exported operations */
@@ -158,7 +158,8 @@ static void __exit exit_lustre_lite(void)
         unregister_filesystem(&lustre_fs_type);
         unregister_filesystem(&lustre_lite_fs_type);
         ll_destroy_inodecache();
-        kmem_cache_destroy(ll_file_data_slab);
+        LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0,
+                 "couldn't destroy ll_file_data slab\n");
         if (proc_lustre_fs_root) {
                 lprocfs_remove(proc_lustre_fs_root);
                 proc_lustre_fs_root = NULL;
index 811d50f..d46bd23 100644 (file)
@@ -936,15 +936,10 @@ out:
 
 static void __exit fsfilt_ext3_exit(void)
 {
-        int rc;
-
         fsfilt_unregister_ops(&fsfilt_ext3_ops);
-        rc = kmem_cache_destroy(fcb_cache);
-
-        if (rc || atomic_read(&fcb_cache_count)) {
-                CERROR("can't free fsfilt callback cache: count %d, rc = %d\n",
-                       atomic_read(&fcb_cache_count), rc);
-        }
+        LASSERTF(kmem_cache_destroy(fcb_cache) == 0,
+                 "can't free fsfilt callback cache: count %d\n",
+                 atomic_read(&fcb_cache_count));
 
         //rc = ext3_xattr_unregister();
 }
index 104c009..43865fe 100644 (file)
@@ -806,11 +806,11 @@ static int mds_getattr(int offset, struct ptlrpc_request *req)
         int rc = 0;
         ENTRY;
 
-        body = lustre_swab_reqbuf (req, offset, sizeof (*body),
-                                   lustre_swab_mds_body);
+        body = lustre_swab_reqbuf(req, offset, sizeof(*body),
+                                  lustre_swab_mds_body);
         if (body == NULL) {
-                CERROR ("Can't unpack body\n");
-                RETURN (-EFAULT);
+                CERROR("Can't unpack body\n");
+                RETURN(-EFAULT);
         }
 
         uc.ouc_fsuid = body->fsuid;
@@ -819,14 +819,14 @@ static int mds_getattr(int offset, struct ptlrpc_request *req)
         push_ctxt(&saved, &obd->obd_ctxt, &uc);
         de = mds_fid2dentry(mds, &body->fid1, NULL);
         if (IS_ERR(de)) {
-                rc = req->rq_status = -ENOENT;
-                GOTO(out_pop, PTR_ERR(de));
+                rc = req->rq_status = PTR_ERR(de);
+                GOTO(out_pop, rc);
         }
 
         rc = mds_getattr_pack_msg(req, de->d_inode, offset);
         if (rc != 0) {
-                CERROR ("mds_getattr_pack_msg: %d\n", rc);
-                GOTO (out_pop, rc);
+                CERROR("mds_getattr_pack_msg: %d\n", rc);
+                GOTO(out_pop, rc);
         }
 
         req->rq_status = mds_getattr_internal(obd, de, req, body, 0);
index 2ea2a9a..e76317f 100644 (file)
@@ -88,6 +88,7 @@ int proc_version;
 /* The following are visible and mutable through /proc/sys/lustre/. */
 unsigned int obd_fail_loc;
 unsigned int obd_timeout = 100;
+unsigned int ldlm_timeout = 6;
 char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall  */
 unsigned int obd_sync_filter; /* = 0, don't sync by default */
 
@@ -383,6 +384,7 @@ EXPORT_SYMBOL(obdo_cachep);
 EXPORT_SYMBOL(obd_fail_loc);
 EXPORT_SYMBOL(obd_race_waitq);
 EXPORT_SYMBOL(obd_timeout);
+EXPORT_SYMBOL(ldlm_timeout);
 EXPORT_SYMBOL(obd_lustre_upcall);
 EXPORT_SYMBOL(obd_sync_filter);
 EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
index aee57a7..a8db9cb 100644 (file)
@@ -273,13 +273,13 @@ struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
 struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
 {
         int i;
-        if (next == NULL) 
+        if (next == NULL)
                 i = 0;
         else if (*next >= 0 && *next < MAX_OBD_DEVICES)
                 i = *next;
-        else 
+        else
                 return NULL;
-                
+
         for (; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
                 if (obd->obd_type == NULL)
@@ -297,18 +297,15 @@ struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
 
 void obd_cleanup_caches(void)
 {
-        int rc;
         ENTRY;
         if (obdo_cachep) {
-                rc = kmem_cache_destroy(obdo_cachep);
-                if (rc)
-                        CERROR("Cannot destory ll_obdo_cache\n");
+                LASSERTF(kmem_cache_destroy(obdo_cachep) == 0,
+                         "Cannot destory ll_obdo_cache\n");
                 obdo_cachep = NULL;
         }
         if (import_cachep) {
-                rc = kmem_cache_destroy(import_cachep);
-                if (rc)
-                        CERROR("Cannot destory ll_import_cache\n");
+                LASSERTF(kmem_cache_destroy(import_cachep) == 0,
+                         "Cannot destory ll_import_cache\n");
                 import_cachep = NULL;
         }
         EXIT;
index 8c93a48..9937ccc 100644 (file)
@@ -52,6 +52,7 @@ enum {
         OBD_TIMEOUT,            /* RPC timeout before recovery/intr */
         OBD_UPCALL,             /* path to recovery upcall */
         OBD_SYNCFILTER,         /* XXX temporary, as we play with sync osts.. */
+        OBD_LDLM_TIMEOUT,       /* LDLM timeout for ASTs before client eviction */
 };
 
 int proc_fail_loc(ctl_table *table, int write, struct file *filp,
@@ -67,6 +68,8 @@ static ctl_table obd_table[] = {
                 &proc_dostring, &sysctl_string },
         {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int),
                 0644, NULL, &proc_dointvec},
+        {OBD_TIMEOUT, "ldlm_timeout", &ldlm_timeout, sizeof(int), 0644, NULL,
+                &proc_dointvec},
         { 0 }
 };
 
index 66a06d2..ded86b3 100644 (file)
@@ -2342,7 +2342,10 @@ static void osc_set_data_with_check(struct lustre_handle *lockh, void *data)
 {
         struct ldlm_lock *lock = ldlm_handle2lock(lockh);
 
-        LASSERTF(lock != NULL, "lockh %p, data %p\n", lockh, data);
+        if (lock == NULL) {
+                CERROR("lockh %p, data %p - client evicted?\n", lockh, data);
+                return;
+        }
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
 #ifdef __KERNEL__
         if (lock->l_ast_data && lock->l_ast_data != data) {
@@ -2466,7 +2469,7 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm,
         rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type,
                              policy, mode, lockh);
         if (rc) {
-                if (!(*flags & LDLM_FL_TEST_LOCK))
+                //if (!(*flags & LDLM_FL_TEST_LOCK))
                         osc_set_data_with_check(lockh, data);
                 RETURN(rc);
         }
index 7ef963d..5dc783d 100644 (file)
@@ -821,7 +821,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
         struct obd_import *imp = req->rq_import;
         ENTRY;
 
-        DEBUG_REQ(D_ERROR, req, "timeout");
+        DEBUG_REQ(D_ERROR, req, "timeout (sent %lu)", (long)req->rq_sent);
 
         spin_lock_irqsave (&req->rq_lock, flags);
         req->rq_timedout = 1;
@@ -949,7 +949,6 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
         }
         RETURN(timeout);
 }
-                
 
 int ptlrpc_set_wait(struct ptlrpc_request_set *set)
 {
@@ -1410,7 +1409,7 @@ restart:
                 timeout = 1;
         } else {
                 timeout = MAX(req->rq_timeout * HZ, 1);
-                DEBUG_REQ(D_NET, req, "-- sleeping");
+                DEBUG_REQ(D_NET, req, "-- sleeping for %d jiffies", timeout);
         }
         lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
                                req);
index 3b366b3..f702d2d 100644 (file)
@@ -157,7 +157,7 @@ void request_in_callback(ptl_event_t *ev)
         struct ptlrpc_srv_ni              *srv_ni = rqbd->rqbd_srv_ni;
         struct ptlrpc_service             *service = srv_ni->sni_service;
         struct ptlrpc_request             *req;
-        long                               flags;
+        unsigned long                     flags;
         ENTRY;
 
         LASSERT (ev->type == PTL_EVENT_PUT ||
index dae2e5d..4a13771 100644 (file)
@@ -418,8 +418,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
          * client's timeout is similar to mine, she'll be timing out this
          * REQ anyway (bug 1502) */
         if (timediff / 1000000 > (long)obd_timeout) {
-                CERROR("Dropping timed-out request from "LPX64
-                       ": %ld seconds old\n",
+                CERROR("Dropping timed-out opc %d request from "LPX64
+                       ": %ld seconds old\n", request->rq_reqmsg->opc,
                        request->rq_peer.peer_nid, timediff / 1000000);
                 goto out;
         }