Whamcloud - gitweb
Branch b1_4
authornathan <nathan>
Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
committernathan <nathan>
Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
b=5921
r=adilger
Land b1_4_5921 ping evictor

16 files changed:
lustre/ChangeLog
lustre/include/linux/lustre_export.h
lustre/include/linux/obd.h
lustre/include/linux/obd_class.h
lustre/include/linux/obd_support.h
lustre/mds/handler.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/obd_config.c
lustre/obdfilter/filter.c
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/service.c
lustre/tests/recovery-small.sh
lustre/tests/test-framework.sh

index ace07ea..42187af 100644 (file)
@@ -19,6 +19,7 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
        - don't hold i_size_sem in ll_nopage() and ll_ap_refresh_count (6077)
        - don't hold client locks on temporary worklist from l_lru (5666)
        - handle IO errors in 2.6 obdfilter bio completion routine (6046)
+       - automatically evict dead clients (5921)
        * miscellania
        - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs
        - allow --write-conf on an MDS with different nettype than client (5619)
index 5136d66..d06af11 100644 (file)
@@ -55,6 +55,7 @@ struct obd_export {
         atomic_t                  exp_refcount;
         struct obd_uuid           exp_client_uuid;
         struct list_head          exp_obd_chain;
+        struct list_head          exp_obd_chain_timed; /* for ping evictor */
         struct obd_device        *exp_obd;
         struct obd_import        *exp_imp_reverse; /* to make RPCs backwards */
         struct ptlrpc_connection *exp_connection;
index 4dca21b..3715578 100644 (file)
@@ -525,6 +525,8 @@ struct obd_device {
         struct llog_ctxt        *obd_llog_ctxt[LLOG_MAX_CTXTS];
         struct obd_device       *obd_observer;
         struct obd_export       *obd_self_export;
+        struct list_head        obd_exports_timed;  /* for ping evictor */
+        time_t                  obd_eviction_timer; /* for ping evictor */
 
         /* XXX encapsulate all this recovery data into one struct */
         svc_handler_t                    obd_recovery_handler;
index e7d1afd..0a12a08 100644 (file)
@@ -130,6 +130,7 @@ do {                                                                           \
 void __class_export_put(struct obd_export *);
 struct obd_export *class_new_export(struct obd_device *obddev);
 void class_unlink_export(struct obd_export *exp);
+void class_update_export_timer(struct obd_export *exp, time_t extra_delay);
 
 struct obd_import *class_import_get(struct obd_import *);
 void class_import_put(struct obd_import *);
@@ -159,6 +160,11 @@ void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
 int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare);
 void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
 
+/* ping evictor */
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+
+
 #define OBT(dev)        (dev)->obd_type
 #define OBP(dev, op)    (dev)->obd_type->typ_ops->o_ ## op
 #define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
index 2c54309..c45aa38 100644 (file)
@@ -37,7 +37,8 @@ extern atomic_t obd_memory;
 extern int obd_memmax;
 extern unsigned int obd_fail_loc;
 extern unsigned int obd_dump_on_timeout;
-extern unsigned int obd_timeout;
+extern unsigned int obd_timeout;          /* seconds */
+#define PING_INTERVAL (obd_timeout / 4)
 extern unsigned int ldlm_timeout;
 extern char obd_lustre_upcall[128];
 extern unsigned int obd_sync_filter;
@@ -146,6 +147,7 @@ extern wait_queue_head_t obd_race_waitq;
 #define OBD_FAIL_PTLRPC_RQBD             0x502
 #define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
 #define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC         0x505
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
index 42367a2..f7739bc 100644 (file)
@@ -1763,6 +1763,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                               obd->obd_replayable ? "enabled" : "disabled");
         }
 
+        ping_evictor_start();
+        
         sema_init(&mds->mds_quota_info.qi_sem, 1);
         rc = qctxt_init(&mds->mds_quota_ctxt, mds->mds_sb, dqacq_handler);
         if (rc) {
@@ -1951,6 +1953,8 @@ static int mds_cleanup(struct obd_device *obd)
         int must_relock = 0;
         ENTRY;
 
+        ping_evictor_stop();
+
         if (mds->mds_sb == NULL)
                 RETURN(0);
         save_dev = ll_sbdev(mds->mds_sb);
index 201bba5..4d01896 100644 (file)
@@ -444,6 +444,7 @@ EXPORT_SYMBOL(class_conn2cliimp);
 EXPORT_SYMBOL(class_disconnect);
 EXPORT_SYMBOL(class_disconnect_exports);
 EXPORT_SYMBOL(class_disconnect_stale_exports);
+EXPORT_SYMBOL(class_update_export_timer);
 
 EXPORT_SYMBOL(oig_init);
 EXPORT_SYMBOL(oig_release);
@@ -451,6 +452,9 @@ EXPORT_SYMBOL(oig_add_one);
 EXPORT_SYMBOL(oig_wait);
 EXPORT_SYMBOL(oig_complete_one);
 
+EXPORT_SYMBOL(ping_evictor_start);
+EXPORT_SYMBOL(ping_evictor_stop);
+
 /* uuid.c */
 EXPORT_SYMBOL(class_uuid_unparse);
 EXPORT_SYMBOL(lustre_uuid_to_peer);
index db8500f..83de073 100644 (file)
@@ -550,14 +550,18 @@ struct obd_export *class_new_export(struct obd_device *obd)
 
         INIT_LIST_HEAD(&export->exp_handle.h_link);
         class_handle_hash(&export->exp_handle, export_handle_addref);
+        export->exp_last_request_time = CURRENT_SECONDS;
         spin_lock_init(&export->exp_lock);
 
         spin_lock(&obd->obd_dev_lock);
         LASSERT(!obd->obd_stopping); /* shouldn't happen, but might race */
         atomic_inc(&obd->obd_refcount);
         list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+        list_add_tail(&export->exp_obd_chain_timed, 
+                      &export->exp_obd->obd_exports_timed);
         export->exp_obd->obd_num_exports++;
         spin_unlock(&obd->obd_dev_lock);
+
         obd_init_export(export);
         return export;
 }
@@ -568,6 +572,7 @@ void class_unlink_export(struct obd_export *exp)
 
         spin_lock(&exp->exp_obd->obd_dev_lock);
         list_del_init(&exp->exp_obd_chain);
+        list_del_init(&exp->exp_obd_chain_timed);
         exp->exp_obd->obd_num_exports--;
         spin_unlock(&exp->exp_obd->obd_dev_lock);
 
@@ -948,3 +953,241 @@ int oig_wait(struct obd_io_group *oig)
         CDEBUG(D_CACHE, "done waiting on oig %p rc %d\n", oig, oig->oig_rc);
         return oig->oig_rc;
 }
+
+                
+/* Ping evictor thread */
+/* Values of pet_state: PET_READY is set by the evictor thread once it is
+ * running; PET_TERMINATE is set by ping_evictor_stop() to make it exit. */
+#define PET_READY     1
+#define PET_TERMINATE 2
+/* Debug mask used by all ping evictor messages (same as D_HA). */
+#define D_PET D_HA
+
+/* Number of ping_evictor_start() calls not yet matched by a stop. */
+static int               pet_refcount = 0;
+/* Current state of the evictor thread, one of the PET_* values. */
+static int               pet_state;
+/* The evictor thread sleeps here until it is given work or told to exit. */
+static wait_queue_head_t pet_waitq;
+/* Export handed to the evictor by ping_evictor_wake(); NULL when idle.
+ * Its obd is the one whose timed export list gets scanned. */
+static struct obd_export *pet_exp = NULL;
+/* Taken by ping_evictor_wake() while testing and setting pet_exp. */
+static spinlock_t        pet_lock = SPIN_LOCK_UNLOCKED;
+
+/* Hand the obd of @exp to the ping evictor thread and wake the thread up.
+ *
+ * Only one request can be pending at a time; while pet_exp is still set the
+ * call is refused and the caller is expected to try again later.
+ *
+ * Returns 0 if the evictor was woken (or if built without __KERNEL__, where
+ * there is nothing to do), 1 if an earlier request is still pending. */
+static int ping_evictor_wake(struct obd_export *exp)
+{
+#ifdef __KERNEL__
+        spin_lock(&pet_lock);
+        if (pet_exp) {
+                /* eventually the new obd will call here again. */
+                spin_unlock(&pet_lock);
+                return 1;
+        }
+
+        /* We have to make sure the obd isn't destroyed between now and when
+           the ping evictor runs. We'll take a reference here, and drop it
+           when we finish in the evictor.  We don't really care about this
+           export in particular; we just need one to keep the obd.
+
+           The reference must be in place before pet_exp becomes visible:
+           as soon as it is published the evictor may use it, drop the
+           reference and reset pet_exp to NULL.  For the same reason the
+           global must not be dereferenced once the lock is released, so
+           the reference is taken through our own argument. */
+        class_export_get(exp);
+        pet_exp = exp;
+        spin_unlock(&pet_lock);
+
+        wake_up(&pet_waitq);
+#endif
+        return 0;
+}
+               
+#ifdef __KERNEL__
+/* Same as ptlrpc_fail_export, but this module must load first...
+ *
+ * Marks @exp as failed and, unless it had already been marked, disconnects
+ * it.  The flag is tested and set in one critical section under exp_lock,
+ * so only the first caller for a given export goes on to disconnect it. */
+void ping_evictor_fail_export(struct obd_export *exp)
+{
+        unsigned long irq_flags;
+        int was_failed;
+        int err;
+
+        spin_lock_irqsave(&exp->exp_lock, irq_flags);
+        was_failed = exp->exp_failed;
+        exp->exp_failed = 1;
+        spin_unlock_irqrestore(&exp->exp_lock, irq_flags);
+
+        if (!was_failed) {
+                CDEBUG(D_PET, "disconnecting export %p/%s\n",
+                       exp, exp->exp_client_uuid.uuid);
+
+                /* obd_disconnect() callers normally give up a reference of
+                 * their own (a request's, for example) besides the one from
+                 * the hash table.  There is none to give up here, so take
+                 * one first. */
+                class_export_get(exp);
+                err = obd_disconnect(exp);
+                if (err != 0)
+                        CERROR("disconnecting export %p failed: %d\n",
+                               exp, err);
+                CERROR("disconnected export %p/%s\n",
+                       exp, exp->exp_client_uuid.uuid);
+                return;
+        }
+
+        /* somebody else got here first; nothing left for us to do */
+        CDEBUG(D_PET, "disconnecting dead export %p/%s; skipping\n",
+               exp, exp->exp_client_uuid.uuid);
+}
+
+/* Body of the ping evictor kernel thread (started by ping_evictor_start()).
+ *
+ * After daemonizing and blocking all signals, the thread sleeps on pet_waitq
+ * until ping_evictor_wake() sets pet_exp or ping_evictor_stop() sets
+ * pet_state to PET_TERMINATE.  For each wakeup with work it scans the timed
+ * export list of pet_exp's obd from the oldest entry and evicts every export
+ * whose last request is older than 1.5 * obd_timeout, then drops the
+ * reference held on pet_exp and clears it.
+ *
+ * @arg is unused.  Always returns 0. */
+static int ping_evictor_main(void *arg)
+{
+        struct list_head *pos;
+        struct obd_device *obd;
+        struct obd_export *exp;
+        struct obd_export *prev;
+        struct l_wait_info lwi = { 0 };
+        time_t expire_time;
+        unsigned long flags;
+        ENTRY;
+
+        lock_kernel();
+        kportal_daemonize("ping_evictor");
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        unlock_kernel();
+
+        CDEBUG(D_PET, "Starting Ping Evictor\n");
+        pet_exp = NULL;
+        pet_state = PET_READY;
+        while (1) {
+                l_wait_event(pet_waitq, pet_exp ||
+                             (pet_state == PET_TERMINATE), &lwi);
+                if (pet_state == PET_TERMINATE)
+                        break;
+
+                obd = pet_exp->exp_obd;
+                expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
+
+                CDEBUG(D_PET, "evicting all exports of obd %s older than %ld\n",
+                       obd->obd_name, expire_time);
+
+                /* Exports can't be deleted out of the list, which means we
+                   can't lose the last ref on the export, while we hold the obd
+                   lock (class_unlink_export).  If they've already been
+                   removed from the list, we won't find them here.
+
+                   The lock has to be dropped to evict an export, and while it
+                   is dropped any other entry may be unlinked or moved.  So no
+                   list pointer is carried across the unlocked section unless
+                   the export it belongs to is pinned by a reference: the
+                   cursor is re-derived under the lock either from the export
+                   we still hold, or from the list head if that export has
+                   left the list. */
+                prev = NULL;
+                spin_lock(&obd->obd_dev_lock);
+                pos = obd->obd_exports_timed.next;
+                while (pos != &obd->obd_exports_timed) {
+                        exp = list_entry(pos, struct obd_export,
+                                         exp_obd_chain_timed);
+
+                        /* List is sorted, so everyone below is ok */
+                        if (expire_time <= exp->exp_last_request_time)
+                                break;
+
+                        class_export_get(exp);
+                        spin_unlock(&obd->obd_dev_lock);
+
+                        /* The previous export only had to stay pinned until
+                           the cursor moved off it; release it now that the
+                           lock is no longer held. */
+                        if (prev != NULL) {
+                                class_export_put(prev);
+                        }
+                        prev = exp;
+
+                        LCONSOLE_WARN("%s hasn't heard from %s in %ld "
+                                      "seconds.  I think it's dead, "
+                                      "and I am evicting it.\n",
+                                      obd->obd_name,
+                                      exp->exp_client_uuid.uuid,
+                                      (long)(CURRENT_SECONDS -
+                                      exp->exp_last_request_time));
+                        ping_evictor_fail_export(exp);
+
+                        /* lock again for the next entry */
+                        spin_lock(&obd->obd_dev_lock);
+                        if (list_empty(&exp->exp_obd_chain_timed)) {
+                                /* evicted: the oldest survivor is at the
+                                   head of the list again */
+                                pos = obd->obd_exports_timed.next;
+                        } else {
+                                /* still linked (e.g. somebody else is
+                                   already failing it): step over it */
+                                pos = exp->exp_obd_chain_timed.next;
+                        }
+                }
+                spin_unlock(&obd->obd_dev_lock);
+
+                if (prev != NULL) {
+                        class_export_put(prev);
+                }
+                class_export_put(pet_exp);
+                pet_exp = NULL;
+        }
+        CDEBUG(D_PET, "Exiting Ping Evictor\n");
+
+        RETURN(0);
+}
+#endif 
+
+/* Register one more user of the ping evictor.  The first user initializes
+ * the wait queue and spawns the evictor thread; later users only bump the
+ * count.  If the thread cannot be created the count is rolled back and an
+ * error is logged.  Does nothing when built without __KERNEL__. */
+void ping_evictor_start(void)
+{
+#ifdef __KERNEL__
+        int thread_rc;
+
+        pet_refcount++;
+        if (pet_refcount <= 1) {
+                init_waitqueue_head(&pet_waitq);
+
+                thread_rc = kernel_thread(ping_evictor_main, NULL,
+                                          CLONE_VM | CLONE_FS);
+                if (thread_rc < 0) {
+                        pet_refcount--;
+                        CERROR("Cannot start ping evictor thread: %d\n", thread_rc);
+                }
+        }
+#endif
+}
+
+/* Drop one user of the ping evictor.  When the count reaches zero (or
+ * below) the thread is told to terminate and woken so that it notices.
+ * Does nothing when built without __KERNEL__. */
+void ping_evictor_stop(void)
+{
+#ifdef __KERNEL__
+        pet_refcount--;
+        if (pet_refcount <= 0) {
+                pet_state = PET_TERMINATE;
+                wake_up(&pet_waitq);
+        }
+#endif
+}
+
+/* This function makes sure dead exports are evicted in a timely manner.
+   This function is only called when some export receives a message (i.e.,
+   the network is up.)
+
+   It records the time of the request on @exp (pushed @extra_delay seconds
+   into the future), moves @exp to the tail of its obd's timed export list,
+   and then works the obd's two-stage eviction timer:
+    - timer not armed: if the oldest export on the list has been silent for
+      more than 1.5 * obd_timeout (+ @extra_delay), arm the timer for
+      3 * PING_INTERVAL from now;
+    - timer armed and expired (+ @extra_delay): wake the ping evictor and,
+      if it accepted the request, disarm the timer.
+   Exports that are not on the timed list are left alone after their
+   request time has been updated. */
+void class_update_export_timer(struct obd_export *exp, time_t extra_delay)
+{
+        struct obd_export *oldest_exp;
+        struct obd_uuid oldest_uuid;
+        time_t oldest_time;
+
+        LASSERT(exp);
+
+        /* Compensate for slow machines, etc, by faking our request time
+           into the future.  Although this can break the strict time-ordering
+           of the list, we can be really lazy here - we don't have to evict
+           at the exact right moment.  Eventually, all silent exports
+           will make it to the top of the list. */
+        exp->exp_last_request_time = max(exp->exp_last_request_time,
+                                         (time_t)CURRENT_SECONDS + extra_delay);
+
+        CDEBUG(D_PET, "updating export %s at %ld\n",
+               exp->exp_client_uuid.uuid,
+               exp->exp_last_request_time);
+
+        /* exports may get disconnected from the chain even though the
+           export has references, so we must keep the spin lock while
+           manipulating the lists */
+        spin_lock(&exp->exp_obd->obd_dev_lock);
+
+        if (list_empty(&exp->exp_obd_chain_timed)) {
+                /* this one is not timed */
+                spin_unlock(&exp->exp_obd->obd_dev_lock);
+                return;
+        }
+
+        list_move_tail(&exp->exp_obd_chain_timed,
+                       &exp->exp_obd->obd_exports_timed);
+
+        /* Note - racing to start/reset the obd_eviction timer is safe */
+        if (exp->exp_obd->obd_eviction_timer == 0) {
+                /* Check if the oldest entry is expired.  We hold no
+                   reference on the oldest export, so it may be unlinked and
+                   freed as soon as the lock is dropped: copy what we need
+                   from it while the lock still protects it. */
+                oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+                                        struct obd_export, exp_obd_chain_timed);
+                oldest_time = oldest_exp->exp_last_request_time;
+                oldest_uuid = oldest_exp->exp_client_uuid;
+                spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+                if (CURRENT_SECONDS > (oldest_time +
+                                       (3 * obd_timeout / 2) + extra_delay)) {
+                        /* We need a second timer, in case the net was
+                           down and it just came back. Since the pinger
+                           may skip every other PING_INTERVAL (see note in
+                           ptlrpc_pinger_main), we better wait for 3. */
+                        exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
+                                3 * PING_INTERVAL;
+                        CDEBUG(D_PET,
+                               "Thinking about evicting old export %s at %ld\n",
+                               oldest_uuid.uuid,
+                               oldest_time);
+                }
+        } else {
+                spin_unlock(&exp->exp_obd->obd_dev_lock);
+                if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
+                                       extra_delay)) {
+                        /* The evictor won't evict anyone who we've heard from
+                           recently, so we don't have to check before we start
+                           it. */
+                        if (!ping_evictor_wake(exp))
+                                exp->exp_obd->obd_eviction_timer = 0;
+                }
+        }
+}
+
index 9277d2a..060e675 100644 (file)
@@ -95,6 +95,7 @@ int class_attach(struct lustre_cfg *lcfg)
         cleanup_phase = 3;  /* class_release_dev */
 
         INIT_LIST_HEAD(&obd->obd_exports);
+        INIT_LIST_HEAD(&obd->obd_exports_timed);
         obd->obd_num_exports = 0;
         spin_lock_init(&obd->obd_dev_lock);
         spin_lock_init(&obd->obd_osfs_lock);
@@ -185,6 +186,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         memcpy(&exp->exp_client_uuid, &obd->obd_uuid,
                sizeof(exp->exp_client_uuid));
         obd->obd_self_export = exp;
+        list_del_init(&exp->exp_obd_chain_timed);
         class_export_put(exp);
 
         err = obd_setup(obd, sizeof(*lcfg), lcfg);
@@ -333,7 +335,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
                         GOTO(out, err = -EBUSY);
                 }
                 CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
-                       obd->obd_name, atomic_read(&obd->obd_refcount));
+                       obd->obd_name, atomic_read(&obd->obd_refcount) - 1);
                 dump_exports(obd);
                 class_disconnect_exports(obd);
         }
index 4dcf70e..4785a27 100644 (file)
@@ -1349,6 +1349,8 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
                 lproc_filter_attach_seqstat(obd);
         }
 
+        ping_evictor_start();
+
         return rc;
 }
 
@@ -1378,6 +1380,8 @@ static int filter_cleanup(struct obd_device *obd)
                 }
         }
 
+        ping_evictor_stop();
+
         qctxt_cleanup(&filter->fo_quota_ctxt, 0);
 
         ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
@@ -1413,7 +1417,6 @@ static int filter_cleanup(struct obd_device *obd)
         //destroy_buffers(filter->fo_sb->s_dev);
         filter->fo_sb = NULL;
 
-
         ll_clear_rdonly(save_dev);
 
         if (must_relock)
index 3a79e8a..df39056 100644 (file)
@@ -431,11 +431,14 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
 
         msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
 
+        /* All imports are pingable */
+        imp->imp_pingable = 1;
+        
         if (aa->pcaa_initial_connect) {
                 if (msg_flags & MSG_CONNECT_REPLAYABLE) {
                         CDEBUG(D_HA, "connected to replayable target: %s\n",
                                imp->imp_target_uuid.uuid);
-                        imp->imp_pingable = imp->imp_replayable = 1;
+                        imp->imp_replayable = 1;
                 } else {
                         imp->imp_replayable = 0;
                 }
index ee2257e..73a5e47 100644 (file)
@@ -392,6 +392,8 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         ptl_md_t         reply_md;
         ENTRY;
 
+        OBD_FAIL_RETURN(OBD_FAIL_PTLRPC_DROP_RPC, 0); 
+
         LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
 
         /* If this is a re-transmit, we're required to have disengaged
@@ -406,7 +408,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                 request->rq_err = 1;
                 RETURN(-ENODEV);
         }
-
+        
         connection = request->rq_import->imp_connection;
 
         if (request->rq_bulk != NULL) {
index 05172e2..4b79c69 100644 (file)
@@ -66,7 +66,7 @@ int ptlrpc_ping(struct obd_import *imp)
 
 static inline void ptlrpc_update_next_ping(struct obd_import *imp)
 {
-        imp->imp_next_ping = jiffies + obd_timeout * HZ;
+        imp->imp_next_ping = jiffies + PING_INTERVAL * HZ;
 }
 
 #ifdef __KERNEL__
@@ -99,7 +99,7 @@ static int ptlrpc_pinger_main(void *arg)
         while (1) {
                 unsigned long this_ping = jiffies;
                 long time_to_next_ping;
-                struct l_wait_info lwi = LWI_TIMEOUT(obd_timeout * HZ,
+                struct l_wait_info lwi = LWI_TIMEOUT(PING_INTERVAL * HZ,
                                                      NULL, NULL);
                 struct list_head *iter;
 
@@ -120,12 +120,15 @@ static int ptlrpc_pinger_main(void *arg)
                         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
                         if (force ||
-                            time_after_eq(this_ping, imp->imp_next_ping)) {
+                            /* if the next ping is within, say, 5 jiffies from
+                               now, go ahead and ping. See note below. */
+                            time_after_eq(this_ping, imp->imp_next_ping - 5)) {
                                 if (level == LUSTRE_IMP_DISCON &&
                                     !imp->imp_deactive) {
                                         /* wait at least a timeout before
                                            trying recovery again. */
-                                        ptlrpc_update_next_ping(imp);
+                                        imp->imp_next_ping = jiffies + 
+                                                obd_timeout * HZ;
                                         ptlrpc_initiate_recovery(imp);
                                 }
                                 else if (level != LUSTRE_IMP_FULL ||
@@ -140,25 +143,32 @@ static int ptlrpc_pinger_main(void *arg)
                                         ptlrpc_ping(imp);
                                 }
 
-                        } else if (!imp->imp_pingable) {
-                                continue;
+                        } else {
+                                if (!imp->imp_pingable) 
+                                        continue;
+                                CDEBUG(D_HA, 
+                                       "don't need to ping %s (%lu > %lu)\n",
+                                       imp->imp_target_uuid.uuid,
+                                       imp->imp_next_ping, this_ping);
                         }
 
-                        CDEBUG(D_HA, "don't need to ping %s (%lu > %lu)\n",
-                               imp->imp_target_uuid.uuid,
-                               imp->imp_next_ping, this_ping);
-
                         /* obd_timeout might have changed */
                         if (time_after(imp->imp_next_ping,
-                                       this_ping + obd_timeout * HZ))
+                                       this_ping + PING_INTERVAL * HZ))
                                 ptlrpc_update_next_ping(imp);
                 }
                 up(&pinger_sem);
 
                 /* Wait until the next ping time, or until we're stopped. */
-                time_to_next_ping = this_ping + (obd_timeout * HZ) - jiffies;
+                time_to_next_ping = this_ping + (PING_INTERVAL * HZ) - jiffies;
+                /* The ping sent by ptlrpc_send_rpc may get sent out
+                   say .01 second after this.  
+                   ptlrpc_pinger_sending_on_import will then set the
+                   next ping time to next_ping + .01 sec, which means 
+                   we will SKIP the next ping at next_ping, and the
+                   ping will get sent 2 timeouts from now!  Beware. */
                 CDEBUG(D_HA, "next ping in %lu (%lu)\n", time_to_next_ping,
-                       this_ping + obd_timeout * HZ);
+                       this_ping + PING_INTERVAL * HZ);
                 if (time_to_next_ping > 0) {
                         lwi = LWI_TIMEOUT(time_to_next_ping, NULL, NULL);
                         l_wait_event(thread->t_ctl_waitq,
@@ -346,7 +356,7 @@ static int pinger_check_rpcs(void *arg)
                 int generation, level;
                 unsigned long flags;
 
-                if (time_after_eq(pd->pd_this_ping, imp->imp_next_ping)) {
+                if (time_after_eq(pd->pd_this_ping, imp->imp_next_ping - 5)) {
                         /* Add a ping. */
                         spin_lock_irqsave(&imp->imp_lock, flags);
                         generation = imp->imp_generation;
@@ -399,7 +409,7 @@ do_check_set:
         rc = ptlrpc_check_set(set);
 
         /* not finished, and we are not expired, simply return */
-        if (!rc && time_before(curtime, pd->pd_this_ping + obd_timeout * HZ)) {
+        if (!rc && time_before(curtime, pd->pd_this_ping + PING_INTERVAL * HZ)) {
                 CDEBUG(D_HA, "not finished, but also not expired\n");
                 pd->pd_recursion--;
                 return 0;
@@ -430,7 +440,7 @@ do_check_set:
         ptlrpc_set_destroy(set);
         pd->pd_set = NULL;
 
-        pd->pd_next_ping = pd->pd_this_ping + obd_timeout * HZ;
+        pd->pd_next_ping = pd->pd_this_ping + PING_INTERVAL * HZ;
         pd->pd_this_ping = 0; /* XXX for debug */
 
         CDEBUG(D_HA, "finished a round ping\n");
index 829c078..1702e0b 100644 (file)
@@ -443,6 +443,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
         int                    rc;
         ENTRY;
 
+        LASSERT(svc);
+
         spin_lock_irqsave (&svc->srv_lock, flags);
         if (list_empty (&svc->srv_request_queue) ||
             (svc->srv_n_difficult_replies != 0 &&
@@ -494,17 +496,6 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
 
         CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
 
-        /* Discard requests queued for longer than my timeout.  If the
-         * client's timeout is similar to mine, she'll be timing out this
-         * REQ anyway (bug 1502) */
-        if (timediff / 1000000 > (long)obd_timeout) {
-                CERROR("Dropping timed-out opc %d request from %s"
-                       ": %ld seconds old\n", request->rq_reqmsg->opc,
-                       request->rq_peerstr,
-                       timediff / 1000000);
-                goto out;
-        }
-
         request->rq_export = class_conn2export(&request->rq_reqmsg->handle);
 
         if (request->rq_export) {
@@ -527,7 +518,19 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
                         goto put_conn;
                 }
 
-                request->rq_export->exp_last_request_time = CURRENT_SECONDS;
+                class_update_export_timer(request->rq_export, 
+                                          (time_t)(timediff / 1000000));
+        }
+
+        /* Discard requests queued for longer than my timeout.  If the
+         * client's timeout is similar to mine, she'll be timing out this
+         * REQ anyway (bug 1502) */
+        if (timediff / 1000000 > (long)obd_timeout) {
+                CERROR("Dropping timed-out opc %d request from %s"
+                       ": %ld seconds old\n", request->rq_reqmsg->opc,
+                       request->rq_peerstr,
+                       timediff / 1000000);
+                goto put_conn;
         }
 
         request->rq_phase = RQ_PHASE_INTERPRET;
index 2980fdf..2c1fbd9 100755 (executable)
@@ -395,7 +395,33 @@ test_24() {        # bug 2248 - eviction fails writeback but app doesn't see it
 }
 run_test 24 "fsync error (should return error)" 
 
-test_25a() {
+test_26() {      # bug 5921 - evict dead exports 
+       # This test can only run from a client on a separate node: with a
+       # local OST or MDS the fail_loc below would hit the servers too.
+       if [ -n "$(lsmod | grep obdfilter)" ]; then
+           echo "skipping test 26 (local OST)"
+           return
+       fi
+       if [ -n "$(lsmod | grep mds)" ]; then
+           echo "skipping test 26 (local MDS)"
+           return
+       fi
+
+       # export count on the OST before the client goes silent
+       OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports
+       OST_EXP="$(do_facet ost cat $OST_FILE)"
+       OST_NEXP1=$(echo $OST_EXP | cut -d' ' -f2)
+       echo starting with $OST_NEXP1 OST exports
+
+       # OBD_FAIL_PTLRPC_DROP_RPC 0x505
+       do_facet client sysctl -w lustre.fail_loc=0x505
+
+       # evictor takes up to 2.25x to evict.  But if there's a 
+       # race to start the evictor from various obds, the loser
+       # might have to wait for the next ping.
+       echo Waiting for $(($TIMEOUT * 4)) secs
+       sleep $(($TIMEOUT * 4))
+
+       # export count afterwards; it must have dropped
+       OST_EXP="$(do_facet ost cat $OST_FILE)"
+       OST_NEXP2=$(echo $OST_EXP | cut -d' ' -f2)
+       echo ending with $OST_NEXP2 OST exports
+       do_facet client sysctl -w lustre.fail_loc=0x0
+       if [ $OST_NEXP1 -le $OST_NEXP2 ]; then
+           error "client not evicted"
+       fi
+       return 0
+}
+run_test 26 "evict dead exports"
+
+test_50() {     # bug 4834 - failover under load failures
        mkdir -p $DIR/$tdir
        # put a load of file creates/writes/deletes for 10 min.
        do_facet client "writemany -q -a $DIR/$tdir/$tfile 600 5" &
@@ -415,9 +441,9 @@ test_25a() {
        echo writemany returned $rc
        return $rc
 }
-run_test 25a "failover MDS under load"
+run_test 50 "failover MDS under load"
 
-test_25b() {
+test_51() {
        mkdir -p $DIR/$tdir
        # put a load of file creates/writes/deletes
        do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" &
@@ -442,9 +468,9 @@ test_25b() {
        echo writemany returned $rc
        return $rc
 }
-run_test 25b "failover MDS during recovery"
+run_test 51 "failover MDS during recovery"
 
-test_25c_guts() {
+test_52_guts() {
        do_facet client "writemany -q $DIR/$tdir/$tfile 600 5" &
        CLIENT_PID=$!
        echo writemany pid $CLIENT_PID
@@ -461,22 +487,23 @@ test_25c_guts() {
        return $rc
 }
 
-test_25c() {
+test_52() {
        mkdir -p $DIR/$tdir
-       test_25c_guts
+       test_52_guts
        rc=$?
        [ $rc -ne 0 ] && { return $rc; }
        # wait for client to reconnect to OST
        sleep 30
-       test_25c_guts
+       test_52_guts
        rc=$?
        [ $rc -ne 0 ] && { return $rc; }
        sleep 30
-       test_25c_guts
+       test_52_guts
        rc=$?
        client_reconnect
        return $rc
 }
-run_test 25c "failover OST under load"
+run_test 52 "failover OST under load"
+
 
 FORCE=--force $CLEANUP
index 6173ecd..184b18c 100644 (file)
@@ -151,13 +151,13 @@ client_df() {
 }
 
 client_reconnect() {
-    df $MOUNT > /dev/null
     uname -n >> $MOUNT/recon
     if [ ! -z "$CLIENTS" ]; then
        $PDSH $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null
     fi
     echo Connected clients:
     cat $MOUNT/recon
+    ls -l $MOUNT/recon > /dev/null
     rm $MOUNT/recon
 }