Branch b1_4

author nathan <nathan>

Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)

committer nathan <nathan>

Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
author nathan <nathan>
Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
committer nathan <nathan>
Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index ace07ea..42187af 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -19,6 +19,7 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
         - don't hold i_size_sem in ll_nopage() and ll_ap_refresh_count (6077)
         - don't hold client locks on temporary worklist from l_lru (5666)
         - handle IO errors in 2.6 obdfilter bio completion routine (6046)
+       - automatically evict dead clients (5921)
         * miscellania
         - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs
         - allow --write-conf on an MDS with different nettype than client (5619)
diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h

index 5136d66..d06af11 100644 (file)
--- a/lustre/include/linux/lustre_export.h
+++ b/lustre/include/linux/lustre_export.h
@@ -55,6 +55,7 @@ struct obd_export {
          atomic_t                  exp_refcount;
          struct obd_uuid           exp_client_uuid;
          struct list_head          exp_obd_chain;
+        struct list_head          exp_obd_chain_timed; /* for ping evictor */
          struct obd_device        *exp_obd;
          struct obd_import        *exp_imp_reverse; /* to make RPCs backwards */
          struct ptlrpc_connection *exp_connection;
diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h

index 4dca21b..3715578 100644 (file)
--- a/lustre/include/linux/obd.h
+++ b/lustre/include/linux/obd.h
@@ -525,6 +525,8 @@ struct obd_device {
          struct llog_ctxt        *obd_llog_ctxt[LLOG_MAX_CTXTS];
          struct obd_device       *obd_observer;
          struct obd_export       *obd_self_export;
+        struct list_head        obd_exports_timed;  /* for ping evictor */
+        time_t                  obd_eviction_timer; /* for ping evictor */
  
          /* XXX encapsulate all this recovery data into one struct */
          svc_handler_t                    obd_recovery_handler;
diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h

index e7d1afd..0a12a08 100644 (file)
--- a/lustre/include/linux/obd_class.h
+++ b/lustre/include/linux/obd_class.h
@@ -130,6 +130,7 @@ do {                                                                           \
  void __class_export_put(struct obd_export *);
  struct obd_export *class_new_export(struct obd_device *obddev);
  void class_unlink_export(struct obd_export *exp);
+void class_update_export_timer(struct obd_export *exp, time_t extra_delay);
  
  struct obd_import *class_import_get(struct obd_import *);
  void class_import_put(struct obd_import *);
@@ -159,6 +160,11 @@ void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
  int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare);
  void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
  
+/* ping evictor */
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+
+
  #define OBT(dev)        (dev)->obd_type
  #define OBP(dev, op)    (dev)->obd_type->typ_ops->o_ ## op
  #define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h

index 2c54309..c45aa38 100644 (file)
--- a/lustre/include/linux/obd_support.h
+++ b/lustre/include/linux/obd_support.h
@@ -37,7 +37,8 @@ extern atomic_t obd_memory;
  extern int obd_memmax;
  extern unsigned int obd_fail_loc;
  extern unsigned int obd_dump_on_timeout;
-extern unsigned int obd_timeout;
+extern unsigned int obd_timeout;          /* seconds */
+/* Seconds between pings: a quarter of obd_timeout, but never less than
+ * one.  Plain integer division would give 0 for obd_timeout < 4, and a
+ * zero interval leaves anything that sleeps for PING_INTERVAL with no
+ * delay at all. */
+#define PING_INTERVAL ((obd_timeout / 4) ? (obd_timeout / 4) : 1)
  extern unsigned int ldlm_timeout;
  extern char obd_lustre_upcall[128];
  extern unsigned int obd_sync_filter;
@@ -146,6 +147,7 @@ extern wait_queue_head_t obd_race_waitq;
  #define OBD_FAIL_PTLRPC_RQBD             0x502
  #define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
  #define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC         0x505
  
  #define OBD_FAIL_OBD_PING_NET            0x600
  #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c

index 42367a2..f7739bc 100644 (file)
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -1763,6 +1763,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                                obd->obd_replayable ? "enabled" : "disabled");
          }
  
+        ping_evictor_start();
+        
          sema_init(&mds->mds_quota_info.qi_sem, 1);
          rc = qctxt_init(&mds->mds_quota_ctxt, mds->mds_sb, dqacq_handler);
          if (rc) {
@@ -1951,6 +1953,8 @@ static int mds_cleanup(struct obd_device *obd)
          int must_relock = 0;
          ENTRY;
  
+        ping_evictor_stop();
+
          if (mds->mds_sb == NULL)
                  RETURN(0);
          save_dev = ll_sbdev(mds->mds_sb);
diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c

index 201bba5..4d01896 100644 (file)
--- a/lustre/obdclass/class_obd.c
+++ b/lustre/obdclass/class_obd.c
@@ -444,6 +444,7 @@ EXPORT_SYMBOL(class_conn2cliimp);
  EXPORT_SYMBOL(class_disconnect);
  EXPORT_SYMBOL(class_disconnect_exports);
  EXPORT_SYMBOL(class_disconnect_stale_exports);
+EXPORT_SYMBOL(class_update_export_timer);
  
  EXPORT_SYMBOL(oig_init);
  EXPORT_SYMBOL(oig_release);
@@ -451,6 +452,9 @@ EXPORT_SYMBOL(oig_add_one);
  EXPORT_SYMBOL(oig_wait);
  EXPORT_SYMBOL(oig_complete_one);
  
+EXPORT_SYMBOL(ping_evictor_start);
+EXPORT_SYMBOL(ping_evictor_stop);
+
  /* uuid.c */
  EXPORT_SYMBOL(class_uuid_unparse);
  EXPORT_SYMBOL(lustre_uuid_to_peer);
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c

index db8500f..83de073 100644 (file)
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -550,14 +550,18 @@ struct obd_export *class_new_export(struct obd_device *obd)
  
          INIT_LIST_HEAD(&export->exp_handle.h_link);
          class_handle_hash(&export->exp_handle, export_handle_addref);
+        export->exp_last_request_time = CURRENT_SECONDS;
          spin_lock_init(&export->exp_lock);
  
          spin_lock(&obd->obd_dev_lock);
          LASSERT(!obd->obd_stopping); /* shouldn't happen, but might race */
          atomic_inc(&obd->obd_refcount);
          list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+        list_add_tail(&export->exp_obd_chain_timed, 
+                      &export->exp_obd->obd_exports_timed);
          export->exp_obd->obd_num_exports++;
          spin_unlock(&obd->obd_dev_lock);
+
          obd_init_export(export);
          return export;
  }
@@ -568,6 +572,7 @@ void class_unlink_export(struct obd_export *exp)
  
          spin_lock(&exp->exp_obd->obd_dev_lock);
          list_del_init(&exp->exp_obd_chain);
+        list_del_init(&exp->exp_obd_chain_timed);
          exp->exp_obd->obd_num_exports--;
          spin_unlock(&exp->exp_obd->obd_dev_lock);
  
@@ -948,3 +953,241 @@ int oig_wait(struct obd_io_group *oig)
          CDEBUG(D_CACHE, "done waiting on oig %p rc %d\n", oig, oig->oig_rc);
          return oig->oig_rc;
  }
+
+                
+/* Ping evictor thread */
+/* Values stored in pet_state: PET_READY is set by the thread when it
+ * starts, PET_TERMINATE by ping_evictor_stop() to make it exit. */
+#define PET_READY     1
+#define PET_TERMINATE 2
+/* Debug mask used by all ping evictor CDEBUG messages. */
+#define D_PET D_HA
+
+/* Number of ping_evictor_start() calls not yet matched by a stop. */
+static int               pet_refcount = 0;
+/* PET_READY or PET_TERMINATE, see above. */
+static int               pet_state;
+/* The evictor thread sleeps here until pet_exp is set or it is told
+ * to terminate. */
+static wait_queue_head_t pet_waitq;
+/* Export handed to the evictor by ping_evictor_wake(); the thread scans
+ * the timed exports of this export's obd and then resets it to NULL. */
+static struct obd_export *pet_exp = NULL;
+/* Held by ping_evictor_wake() while it tests and sets pet_exp. */
+static spinlock_t        pet_lock = SPIN_LOCK_UNLOCKED;
+
+/* Ask the ping evictor thread to scan the obd that @exp belongs to.
+ *
+ * Returns 1 if the evictor already has a request pending (nothing is
+ * queued; the caller is expected to come back later), 0 otherwise.
+ * Outside the kernel this does nothing and returns 0. */
+static int ping_evictor_wake(struct obd_export *exp)
+{
+#ifdef __KERNEL__
+        spin_lock(&pet_lock);
+        if (pet_exp) {
+                /* eventually the new obd will call here again. */
+                spin_unlock(&pet_lock);
+                return 1;
+        }
+
+        /* We have to make sure the obd isn't destroyed between now and when
+           the ping evictor runs. We'll take a reference here, and drop it
+           when we finish in the evictor.  We don't really care about this
+           export in particular; we just need one to keep the obd.
+
+           The reference must be taken BEFORE pet_exp becomes visible: the
+           evictor's wait condition tests pet_exp, so as soon as it is set
+           the thread may run to completion and drop its reference. */
+        class_export_get(exp);
+        pet_exp = exp;
+        spin_unlock(&pet_lock);
+
+        wake_up(&pet_waitq);
+#endif
+        return 0;
+}
+               
+#ifdef __KERNEL__
+/* Mark @exp as failed and disconnect it.
+ *
+ * Same as ptlrpc_fail_export, but this module must load first...
+ *
+ * If the export was already marked failed, only a debug message is
+ * emitted and the export is left alone.  The caller must hold its own
+ * reference on @exp: it is still dereferenced for logging after
+ * obd_disconnect() returns. */
+void ping_evictor_fail_export(struct obd_export *exp)
+{
+        int rc, already_failed;
+        unsigned long flags;
+
+        /* Test-and-set exp_failed under exp_lock so that only the first
+         * caller goes on to disconnect. */
+        spin_lock_irqsave(&exp->exp_lock, flags);
+        already_failed = exp->exp_failed;
+        exp->exp_failed = 1;
+        spin_unlock_irqrestore(&exp->exp_lock, flags);
+
+        if (already_failed) {
+                CDEBUG(D_PET, "disconnecting dead export %p/%s; skipping\n",
+                       exp, exp->exp_client_uuid.uuid);
+                return;
+        }
+
+        CDEBUG(D_PET, "disconnecting export %p/%s\n",
+               exp, exp->exp_client_uuid.uuid);
+
+        /* Most callers into obd_disconnect are removing their own reference
+         * (request, for example) in addition to the one from the hash table.
+         * We don't have such a reference here, so make one. */
+        class_export_get(exp);
+        rc = obd_disconnect(exp);
+        if (rc)
+                CERROR("disconnecting export %p failed: %d\n", exp, rc);
+        else
+                /* Only claim success when the disconnect succeeded. */
+                CERROR("disconnected export %p/%s\n",
+                       exp, exp->exp_client_uuid.uuid);
+}
+
+/* Body of the "ping_evictor" kernel thread.
+ *
+ * Sleeps on pet_waitq until ping_evictor_wake() hands it an export in
+ * pet_exp or ping_evictor_stop() sets PET_TERMINATE.  For each request it
+ * walks the obd's obd_exports_timed list from the oldest entry and evicts
+ * every export whose last request is older than 1.5 * obd_timeout, then
+ * drops the reference taken by ping_evictor_wake() and clears pet_exp.
+ *
+ * @arg is unused.  Always returns 0. */
+static int ping_evictor_main(void *arg)
+{
+        struct obd_device *obd;
+        struct obd_export *exp;
+        struct l_wait_info lwi = { 0 };
+        time_t expire_time;
+        unsigned long flags;
+        ENTRY;
+
+        /* Detach from the spawning context and block all signals. */
+        lock_kernel();
+        kportal_daemonize("ping_evictor");
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        unlock_kernel();
+
+        CDEBUG(D_PET, "Starting Ping Evictor\n");
+        pet_exp = NULL;
+        pet_state = PET_READY;
+        while (1) {
+                l_wait_event(pet_waitq, pet_exp ||
+                             (pet_state == PET_TERMINATE), &lwi);
+                if (pet_state == PET_TERMINATE)
+                        break;
+
+                obd = pet_exp->exp_obd;
+                expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
+
+                CDEBUG(D_PET, "evicting all exports of obd %s older than %ld\n",
+                       obd->obd_name, (long)expire_time);
+
+                /* Exports can't be deleted out of the list, which means we
+                   can't lose the last ref on the export, while we hold the obd
+                   lock (class_unlink_export).  If they've already been
+                   removed from the list, we won't find them here.
+
+                   The lock has to be dropped around each eviction, so no
+                   list cursor may be carried across that window: a saved
+                   "next" pointer could be unlinked or moved to the tail in
+                   the meantime.  Always restart from the list head. */
+                spin_lock(&obd->obd_dev_lock);
+                while (!list_empty(&obd->obd_exports_timed)) {
+                        exp = list_entry(obd->obd_exports_timed.next,
+                                         struct obd_export,
+                                         exp_obd_chain_timed);
+
+                        /* List is sorted, so everyone below is ok */
+                        if (expire_time <= exp->exp_last_request_time)
+                                break;
+
+                        class_export_get(exp);
+                        /* Take it off the timed list now so that the scan
+                           always makes progress, even if the disconnect
+                           below is skipped or fails.  class_unlink_export
+                           uses list_del_init, so unlinking again is safe. */
+                        list_del_init(&exp->exp_obd_chain_timed);
+                        spin_unlock(&obd->obd_dev_lock);
+
+                        LCONSOLE_WARN("%s hasn't heard from %s in %ld "
+                                      "seconds.  I think it's dead, "
+                                      "and I am evicting it.\n",
+                                      obd->obd_name,
+                                      exp->exp_client_uuid.uuid,
+                                      (long)(CURRENT_SECONDS -
+                                      exp->exp_last_request_time));
+                        ping_evictor_fail_export(exp);
+                        class_export_put(exp);
+
+                        /* lock again for the next entry */
+                        spin_lock(&obd->obd_dev_lock);
+                }
+                spin_unlock(&obd->obd_dev_lock);
+
+                /* Drop the reference taken in ping_evictor_wake() and
+                   allow the next request to be queued. */
+                class_export_put(pet_exp);
+                pet_exp = NULL;
+        }
+        CDEBUG(D_PET, "Exiting Ping Evictor\n");
+
+        RETURN(0);
+}
+#endif 
+
+/* Register one user of the ping evictor.  The first user initializes the
+ * wait queue and spawns the evictor thread; later users only bump the
+ * count.  If the thread cannot be created the count is given back and an
+ * error is logged.  Does nothing outside the kernel. */
+void ping_evictor_start(void)
+{
+#ifdef __KERNEL__
+        int pid;
+
+        pet_refcount++;
+        if (pet_refcount > 1)
+                return;         /* thread already requested */
+
+        init_waitqueue_head(&pet_waitq);
+
+        pid = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS);
+        if (pid >= 0)
+                return;
+
+        pet_refcount--;
+        CERROR("Cannot start ping evictor thread: %d\n", pid);
+#endif
+}
+
+/* Drop one user of the ping evictor; when the last one goes away, ask
+ * the evictor thread to terminate and wake it up.  This does not wait
+ * for the thread to exit.  Does nothing outside the kernel. */
+void ping_evictor_stop(void)
+{
+#ifdef __KERNEL__
+        /* ping_evictor_start() gives its count back when the thread
+           cannot be created, yet the cleanup paths call us
+           unconditionally.  Don't let such an unmatched stop push the
+           count below zero, which would make later starts spawn the
+           thread more than once. */
+        if (pet_refcount <= 0)
+                return;
+
+        if (--pet_refcount > 0)
+                return;
+
+        pet_state = PET_TERMINATE;
+        wake_up(&pet_waitq);
+#endif
+}
+
+/* This function makes sure dead exports are evicted in a timely manner.
+   This function is only called when some export receives a message (i.e.,
+   the network is up.)
+
+   @exp          export that was just heard from; must not be NULL and the
+                 caller must hold a reference on it.
+   @extra_delay  seconds of slack added to the recorded request time and
+                 to the expiry checks.
+
+   Records the request time, moves @exp to the tail of its obd's timed
+   list, and then either arms obd_eviction_timer (when the oldest timed
+   export looks expired) or, once an armed timer has run out, wakes the
+   ping evictor thread. */
+void class_update_export_timer(struct obd_export *exp, time_t extra_delay)
+{
+        struct obd_device *obd;
+        struct obd_export *oldest_exp;
+
+        LASSERT(exp);
+        obd = exp->exp_obd;
+
+        /* Compensate for slow machines, etc, by faking our request time
+           into the future.  Although this can break the strict time-ordering
+           of the list, we can be really lazy here - we don't have to evict
+           at the exact right moment.  Eventually, all silent exports
+           will make it to the top of the list. */
+        exp->exp_last_request_time = max(exp->exp_last_request_time,
+                                         (time_t)CURRENT_SECONDS + extra_delay);
+
+        CDEBUG(D_PET, "updating export %s at %ld\n",
+               exp->exp_client_uuid.uuid,
+               (long)exp->exp_last_request_time);
+
+        /* exports may get disconnected from the chain even though the
+           export has references, so we must keep the spin lock while
+           manipulating the lists */
+        spin_lock(&obd->obd_dev_lock);
+
+        if (list_empty(&exp->exp_obd_chain_timed)) {
+                /* this one is not timed */
+                spin_unlock(&obd->obd_dev_lock);
+                return;
+        }
+
+        list_move_tail(&exp->exp_obd_chain_timed, &obd->obd_exports_timed);
+
+        /* Note - racing to start/reset the obd_eviction timer is safe */
+        if (obd->obd_eviction_timer != 0) {
+                spin_unlock(&obd->obd_dev_lock);
+                if (CURRENT_SECONDS > (obd->obd_eviction_timer +
+                                       extra_delay)) {
+                        /* The evictor won't evict anyone who we've heard from
+                           recently, so we don't have to check before we start
+                           it. */
+                        if (!ping_evictor_wake(exp))
+                                obd->obd_eviction_timer = 0;
+                }
+                return;
+        }
+
+        /* Check if the oldest entry is expired.  The list cannot be empty
+           here (exp was just moved onto it).  Pin the oldest export before
+           dropping the lock: once the lock is released it may be unlinked
+           and lose its last reference while we are still reading it. */
+        oldest_exp = list_entry(obd->obd_exports_timed.next,
+                                struct obd_export, exp_obd_chain_timed);
+        class_export_get(oldest_exp);
+        spin_unlock(&obd->obd_dev_lock);
+
+        if (CURRENT_SECONDS > (oldest_exp->exp_last_request_time +
+                               (3 * obd_timeout / 2) + extra_delay)) {
+                /* We need a second timer, in case the net was
+                   down and it just came back. Since the pinger
+                   may skip every other PING_INTERVAL (see note in
+                   ptlrpc_pinger_main), we better wait for 3. */
+                obd->obd_eviction_timer = CURRENT_SECONDS +
+                        3 * PING_INTERVAL;
+                CDEBUG(D_PET,
+                       "Thinking about evicting old export %s at %ld\n",
+                       oldest_exp->exp_client_uuid.uuid,
+                       (long)oldest_exp->exp_last_request_time);
+        }
+        class_export_put(oldest_exp);
+}
+
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c

index 9277d2a..060e675 100644 (file)
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -95,6 +95,7 @@ int class_attach(struct lustre_cfg *lcfg)
          cleanup_phase = 3;  /* class_release_dev */
  
          INIT_LIST_HEAD(&obd->obd_exports);
+        INIT_LIST_HEAD(&obd->obd_exports_timed);
          obd->obd_num_exports = 0;
          spin_lock_init(&obd->obd_dev_lock);
          spin_lock_init(&obd->obd_osfs_lock);
@@ -185,6 +186,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
          memcpy(&exp->exp_client_uuid, &obd->obd_uuid,
                 sizeof(exp->exp_client_uuid));
          obd->obd_self_export = exp;
+        list_del_init(&exp->exp_obd_chain_timed);
          class_export_put(exp);
  
          err = obd_setup(obd, sizeof(*lcfg), lcfg);
@@ -333,7 +335,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
                          GOTO(out, err = -EBUSY);
                  }
                  CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
-                       obd->obd_name, atomic_read(&obd->obd_refcount));
+                       obd->obd_name, atomic_read(&obd->obd_refcount) - 1);
                  dump_exports(obd);
                  class_disconnect_exports(obd);
          }
diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c

index 4dcf70e..4785a27 100644 (file)
--- a/lustre/obdfilter/filter.c
+++ b/lustre/obdfilter/filter.c
@@ -1349,6 +1349,8 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
                  lproc_filter_attach_seqstat(obd);
          }
  
+        ping_evictor_start();
+
          return rc;
  }
  
@@ -1378,6 +1380,8 @@ static int filter_cleanup(struct obd_device *obd)
                  }
          }
  
+        ping_evictor_stop();
+
          qctxt_cleanup(&filter->fo_quota_ctxt, 0);
  
          ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
@@ -1413,7 +1417,6 @@ static int filter_cleanup(struct obd_device *obd)
          //destroy_buffers(filter->fo_sb->s_dev);
          filter->fo_sb = NULL;
  
-
          ll_clear_rdonly(save_dev);
  
          if (must_relock)
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c

index 3a79e8a..df39056 100644 (file)
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -431,11 +431,14 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
  
          msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
  
+        /* All imports are pingable */
+        imp->imp_pingable = 1;
+        
          if (aa->pcaa_initial_connect) {
                  if (msg_flags & MSG_CONNECT_REPLAYABLE) {
                          CDEBUG(D_HA, "connected to replayable target: %s\n",
                                 imp->imp_target_uuid.uuid);
-                        imp->imp_pingable = imp->imp_replayable = 1;
+                        imp->imp_replayable = 1;
                  } else {
                          imp->imp_replayable = 0;
                  }
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c

index ee2257e..73a5e47 100644 (file)
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -392,6 +392,8 @@ int ptl_send_rpc(struct ptlrpc_request *request)
          ptl_md_t         reply_md;
          ENTRY;
  
+        OBD_FAIL_RETURN(OBD_FAIL_PTLRPC_DROP_RPC, 0); 
+
          LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
  
          /* If this is a re-transmit, we're required to have disengaged
@@ -406,7 +408,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
                  request->rq_err = 1;
                  RETURN(-ENODEV);
          }
-
+        
          connection = request->rq_import->imp_connection;
  
          if (request->rq_bulk != NULL) {
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c

index 05172e2..4b79c69 100644 (file)
--- a/lustre/ptlrpc/pinger.c
+++ b/lustre/ptlrpc/pinger.c
@@ -66,7 +66,7 @@ int ptlrpc_ping(struct obd_import *imp)
  
  static inline void ptlrpc_update_next_ping(struct obd_import *imp)
  {
-        imp->imp_next_ping = jiffies + obd_timeout * HZ;
+        imp->imp_next_ping = jiffies + PING_INTERVAL * HZ;
  }
  
  #ifdef __KERNEL__
@@ -99,7 +99,7 @@ static int ptlrpc_pinger_main(void *arg)
          while (1) {
                  unsigned long this_ping = jiffies;
                  long time_to_next_ping;
-                struct l_wait_info lwi = LWI_TIMEOUT(obd_timeout * HZ,
+                struct l_wait_info lwi = LWI_TIMEOUT(PING_INTERVAL * HZ,
                                                       NULL, NULL);
                  struct list_head *iter;
  
@@ -120,12 +120,15 @@ static int ptlrpc_pinger_main(void *arg)
                          spin_unlock_irqrestore(&imp->imp_lock, flags);
  
                          if (force ||
-                            time_after_eq(this_ping, imp->imp_next_ping)) {
+                            /* if the next ping is within, say, 5 jiffies from
+                               now, go ahead and ping. See note below. */
+                            time_after_eq(this_ping, imp->imp_next_ping - 5)) {
                                  if (level == LUSTRE_IMP_DISCON &&
                                      !imp->imp_deactive) {
                                          /* wait at least a timeout before
                                             trying recovery again. */
-                                        ptlrpc_update_next_ping(imp);
+                                        imp->imp_next_ping = jiffies + 
+                                                obd_timeout * HZ;
                                          ptlrpc_initiate_recovery(imp);
                                  }
                                  else if (level != LUSTRE_IMP_FULL ||
@@ -140,25 +143,32 @@ static int ptlrpc_pinger_main(void *arg)
                                          ptlrpc_ping(imp);
                                  }
  
-                        } else if (!imp->imp_pingable) {
-                                continue;
+                        } else {
+                                if (!imp->imp_pingable) 
+                                        continue;
+                                CDEBUG(D_HA, 
+                                       "don't need to ping %s (%lu > %lu)\n",
+                                       imp->imp_target_uuid.uuid,
+                                       imp->imp_next_ping, this_ping);
                          }
  
-                        CDEBUG(D_HA, "don't need to ping %s (%lu > %lu)\n",
-                               imp->imp_target_uuid.uuid,
-                               imp->imp_next_ping, this_ping);
-
                          /* obd_timeout might have changed */
                          if (time_after(imp->imp_next_ping,
-                                       this_ping + obd_timeout * HZ))
+                                       this_ping + PING_INTERVAL * HZ))
                                  ptlrpc_update_next_ping(imp);
                  }
                  up(&pinger_sem);
  
                  /* Wait until the next ping time, or until we're stopped. */
-                time_to_next_ping = this_ping + (obd_timeout * HZ) - jiffies;
+                time_to_next_ping = this_ping + (PING_INTERVAL * HZ) - jiffies;
+                /* The ping sent by ptlrpc_send_rpc may get sent out
+                   say .01 second after this.  
+                   ptlrpc_pinger_sending_on_import will then set the
+                   next ping time to next_ping + .01 sec, which means 
+                   we will SKIP the next ping at next_ping, and the
+                   ping will get sent 2 timeouts from now!  Beware. */
                  CDEBUG(D_HA, "next ping in %lu (%lu)\n", time_to_next_ping,
-                       this_ping + obd_timeout * HZ);
+                       this_ping + PING_INTERVAL * HZ);
                  if (time_to_next_ping > 0) {
                          lwi = LWI_TIMEOUT(time_to_next_ping, NULL, NULL);
                          l_wait_event(thread->t_ctl_waitq,
@@ -346,7 +356,7 @@ static int pinger_check_rpcs(void *arg)
                  int generation, level;
                  unsigned long flags;
  
-                if (time_after_eq(pd->pd_this_ping, imp->imp_next_ping)) {
+                if (time_after_eq(pd->pd_this_ping, imp->imp_next_ping - 5)) {
                          /* Add a ping. */
                          spin_lock_irqsave(&imp->imp_lock, flags);
                          generation = imp->imp_generation;
@@ -399,7 +409,7 @@ do_check_set:
          rc = ptlrpc_check_set(set);
  
          /* not finished, and we are not expired, simply return */
-        if (!rc && time_before(curtime, pd->pd_this_ping + obd_timeout * HZ)) {
+        if (!rc && time_before(curtime, pd->pd_this_ping + PING_INTERVAL * HZ)) {
                  CDEBUG(D_HA, "not finished, but also not expired\n");
                  pd->pd_recursion--;
                  return 0;
@@ -430,7 +440,7 @@ do_check_set:
          ptlrpc_set_destroy(set);
          pd->pd_set = NULL;
  
-        pd->pd_next_ping = pd->pd_this_ping + obd_timeout * HZ;
+        pd->pd_next_ping = pd->pd_this_ping + PING_INTERVAL * HZ;
          pd->pd_this_ping = 0; /* XXX for debug */
  
          CDEBUG(D_HA, "finished a round ping\n");
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c

index 829c078..1702e0b 100644 (file)
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -443,6 +443,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
          int                    rc;
          ENTRY;
  
+        LASSERT(svc);
+
          spin_lock_irqsave (&svc->srv_lock, flags);
          if (list_empty (&svc->srv_request_queue) ||
              (svc->srv_n_difficult_replies != 0 &&
@@ -494,17 +496,6 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
  
          CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid);
  
-        /* Discard requests queued for longer than my timeout.  If the
-         * client's timeout is similar to mine, she'll be timing out this
-         * REQ anyway (bug 1502) */
-        if (timediff / 1000000 > (long)obd_timeout) {
-                CERROR("Dropping timed-out opc %d request from %s"
-                       ": %ld seconds old\n", request->rq_reqmsg->opc,
-                       request->rq_peerstr,
-                       timediff / 1000000);
-                goto out;
-        }
-
          request->rq_export = class_conn2export(&request->rq_reqmsg->handle);
  
          if (request->rq_export) {
@@ -527,7 +518,19 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
                          goto put_conn;
                  }
  
-                request->rq_export->exp_last_request_time = CURRENT_SECONDS;
+                class_update_export_timer(request->rq_export, 
+                                          (time_t)(timediff / 1000000));
+        }
+
+        /* Discard requests queued for longer than my timeout.  If the
+         * client's timeout is similar to mine, she'll be timing out this
+         * REQ anyway (bug 1502) */
+        if (timediff / 1000000 > (long)obd_timeout) {
+                CERROR("Dropping timed-out opc %d request from %s"
+                       ": %ld seconds old\n", request->rq_reqmsg->opc,
+                       request->rq_peerstr,
+                       timediff / 1000000);
+                goto put_conn;
          }
  
          request->rq_phase = RQ_PHASE_INTERPRET;
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index 2980fdf..2c1fbd9 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -395,7 +395,33 @@ test_24() {        # bug 2248 - eviction fails writeback but app doesn't see it
  }
  run_test 24 "fsync error (should return error)" 
  
-test_25a() {
+test_26() {      # bug 5921 - evict dead exports
+       # This test can only run from a client on a separate node.
+       [ "`lsmod | grep obdfilter`" ] && \
+           echo "skipping test 26 (local OST)" && return
+       [ "`lsmod | grep mds`" ] && \
+           echo "skipping test 26 (local MDS)" && return
+       OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports
+       OST_EXP="`do_facet ost cat $OST_FILE`"
+       OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
+       # Without a starting count the comparison below is meaningless
+       # (an empty operand makes "[ ... -le ... ]" fail, i.e. "pass").
+       [ -z "$OST_NEXP1" ] && { error "cannot read $OST_FILE"; return 1; }
+       echo starting with $OST_NEXP1 OST exports
+       # OBD_FAIL_PTLRPC_DROP_RPC 0x505
+       do_facet client sysctl -w lustre.fail_loc=0x505
+       # evictor takes up to 2.25x to evict.  But if there's a
+       # race to start the evictor from various obds, the loser
+       # might have to wait for the next ping.
+       echo Waiting for $(($TIMEOUT * 4)) secs
+       sleep $(($TIMEOUT * 4))
+       OST_EXP="`do_facet ost cat $OST_FILE`"
+       OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
+       echo ending with $OST_NEXP2 OST exports
+       # Always clear the fail_loc before judging the result.
+       do_facet client sysctl -w lustre.fail_loc=0x0
+       [ -z "$OST_NEXP2" ] && { error "cannot read $OST_FILE"; return 1; }
+       [ "$OST_NEXP1" -le "$OST_NEXP2" ] && error "client not evicted"
+       return 0
+}
+run_test 26 "evict dead exports"
+
+test_50() {     # bug 4834 - failover under load failures
         mkdir -p $DIR/$tdir
         # put a load of file creates/writes/deletes for 10 min.
         do_facet client "writemany -q -a $DIR/$tdir/$tfile 600 5" &
@@ -415,9 +441,9 @@ test_25a() {
         echo writemany returned $rc
         return $rc
  }
-run_test 25a "failover MDS under load"
+run_test 50 "failover MDS under load"
  
-test_25b() {
+test_51() {
         mkdir -p $DIR/$tdir
         # put a load of file creates/writes/deletes
         do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" &
@@ -442,9 +468,9 @@ test_25b() {
         echo writemany returned $rc
         return $rc
  }
-run_test 25b "failover MDS during recovery"
+run_test 51 "failover MDS during recovery"
  
-test_25c_guts() {
+test_52_guts() {
         do_facet client "writemany -q $DIR/$tdir/$tfile 600 5" &
         CLIENT_PID=$!
         echo writemany pid $CLIENT_PID
@@ -461,22 +487,23 @@ test_25c_guts() {
         return $rc
  }
  
-test_25c() {
+test_52() {
         mkdir -p $DIR/$tdir
-       test_25c_guts
+       test_52_guts
         rc=$?
         [ $rc -ne 0 ] && { return $rc; }
         # wait for client to reconnect to OST
         sleep 30
-       test_25c_guts
+       test_52_guts
         rc=$?
         [ $rc -ne 0 ] && { return $rc; }
         sleep 30
-       test_25c_guts
+       test_52_guts
         rc=$?
         client_reconnect
         return $rc
  }
-run_test 25c "failover OST under load"
+run_test 52 "failover OST under load"
+
  
  FORCE=--force $CLEANUP
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index 6173ecd..184b18c 100644 (file)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -151,13 +151,13 @@ client_df() {
  }
  
  client_reconnect() {
-    df $MOUNT > /dev/null
      uname -n >> $MOUNT/recon
      if [ ! -z "$CLIENTS" ]; then
         $PDSH $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null
      fi
      echo Connected clients:
      cat $MOUNT/recon
+    ls -l $MOUNT/recon > /dev/null
      rm $MOUNT/recon
  }
author	nathan <nathan>
	Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
committer	nathan <nathan>
	Fri, 22 Apr 2005 01:14:26 +0000 (01:14 +0000)
lustre/ChangeLog		patch \| blob \| history
lustre/include/linux/lustre_export.h		patch \| blob \| history
lustre/include/linux/obd.h		patch \| blob \| history
lustre/include/linux/obd_class.h		patch \| blob \| history
lustre/include/linux/obd_support.h		patch \| blob \| history
lustre/mds/handler.c		patch \| blob \| history
lustre/obdclass/class_obd.c		patch \| blob \| history
lustre/obdclass/genops.c		patch \| blob \| history
lustre/obdclass/obd_config.c		patch \| blob \| history
lustre/obdfilter/filter.c		patch \| blob \| history
lustre/ptlrpc/import.c		patch \| blob \| history
lustre/ptlrpc/niobuf.c		patch \| blob \| history
lustre/ptlrpc/pinger.c		patch \| blob \| history
lustre/ptlrpc/service.c		patch \| blob \| history
lustre/tests/recovery-small.sh		patch \| blob \| history
lustre/tests/test-framework.sh		patch \| blob \| history