Whamcloud - gitweb
Branch: b1_4
author: adilger <adilger>
Tue, 29 Mar 2005 13:03:33 +0000 (13:03 +0000)
committer: adilger <adilger>
Tue, 29 Mar 2005 13:03:33 +0000 (13:03 +0000)
Don't re-add a lock being destroyed to the waiting locks list.
Debugging (that doesn't panic system) in case this happens again.
b=5653
r=phil

lustre/ChangeLog
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c

index b8b68ed..1121eb6 100644 (file)
@@ -7,6 +7,7 @@
        - swab llog records of type '0' so we get proper header size/idx (5861)
        - send llog cancel req to DLM cancel portal instead of cb portal (5515)
         - avoid SetPageDirty on 2.6 (5981)
+       - don't re-add just-being-destroyed locks to the waiting list (5653)
        * miscellania
        - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs
        - allow --write-conf on an MDS with different nettype than client (5619)
index be4bf58..9e57724 100644 (file)
@@ -684,7 +684,7 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                         struct l_wait_info lwi;
                         if (lock->l_completion_ast) {
                                 int err = lock->l_completion_ast(lock,
-                                                           LDLM_FL_WAIT_NOREPROC,
+                                                          LDLM_FL_WAIT_NOREPROC,
                                                                  NULL);
                                 if (err) {
                                         rc = 0;
@@ -708,7 +708,7 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                            type == LDLM_PLAIN ? res_id->name[3] :
                                 policy->l_extent.end);
                 l_unlock(&ns->ns_lock);
-        } else if (!(flags & LDLM_FL_TEST_LOCK)) { /*less verbose for test-only*/
+        } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/
                 LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res "
                                   LPU64"/"LPU64" ("LPU64" "LPU64")", ns,
                                   type, mode, res_id->name[0], res_id->name[1],
index f5616ad..eb4565d 100644 (file)
@@ -66,6 +66,7 @@ static struct timer_list waiting_locks_timer;
 static struct expired_lock_thread {
         wait_queue_head_t         elt_waitq;
         int                       elt_state;
+        int                       elt_dump;
         struct list_head          elt_expired_locks;
 } expired_lock_thread;
 #endif
@@ -129,6 +130,18 @@ static int expired_lock_main(void *arg)
                              &lwi);
 
                 spin_lock_bh(&waiting_locks_spinlock);
+                if (expired_lock_thread.elt_dump) {
+                        spin_unlock_bh(&waiting_locks_spinlock);
+
+                        /* from waiting_locks_callback, but not in timer */
+                        portals_debug_dumplog();
+                        portals_run_lbug_upcall(__FILE__, "waiting_locks_cb",
+                                                expired_lock_thread.elt_dump);
+
+                        spin_lock_bh(&waiting_locks_spinlock);
+                        expired_lock_thread.elt_dump = 0;
+                }
+
                 while (!list_empty(expired)) {
                         struct obd_export *export;
                         struct ldlm_lock *lock;
@@ -168,6 +181,7 @@ static int expired_lock_main(void *arg)
         RETURN(0);
 }
 
+/* This is called from within a timer interrupt and cannot schedule */
 static void waiting_locks_callback(unsigned long unused)
 {
         struct ldlm_lock *lock, *last = NULL;
@@ -194,8 +208,17 @@ static void waiting_locks_callback(unsigned long unused)
                                &lock->l_pending_chain,
                                lock->l_pending_chain.next,
                                lock->l_pending_chain.prev);
+
+                        INIT_LIST_HEAD(&waiting_locks_list);    /* HACK */
+                        expired_lock_thread.elt_dump = __LINE__;
                         spin_unlock_bh(&waiting_locks_spinlock);
-                        LBUG();
+
+                        /* LBUG(); */
+                        CEMERG("would be an LBUG, but isn't (bug 5653)\n");
+                        portals_debug_dumpstack(NULL);
+                        /*blocks* portals_debug_dumplog(); */
+                        /*blocks* portals_run_lbug_upcall(file, func, line); */
+                        break;
                 }
                 last = lock;
 
@@ -235,6 +258,17 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
         l_check_ns_lock(lock->l_resource->lr_namespace);
 
         spin_lock_bh(&waiting_locks_spinlock);
+        if (lock->l_destroyed) {
+                static unsigned long next;
+                spin_unlock_bh(&waiting_locks_spinlock);
+                LDLM_ERROR(lock, "not waiting on destroyed lock (bug 5653)");
+                if (time_after(jiffies, next)) {
+                        next = jiffies + 14400 * HZ;
+                        portals_debug_dumpstack(NULL);
+                }
+                return 0;
+        }
+
         if (!list_empty(&lock->l_pending_chain)) {
                 spin_unlock_bh(&waiting_locks_spinlock);
                 LDLM_DEBUG(lock, "not re-adding to wait list");