From bf878a3725eaab5bb84d8b5aa2ec68270b734c1b Mon Sep 17 00:00:00 2001 From: nathan Date: Fri, 22 Apr 2005 01:14:26 +0000 Subject: [PATCH] Branch b1_4 b=5921 r=adilger Land b1_4_5921 ping evictor --- lustre/ChangeLog | 1 + lustre/include/linux/lustre_export.h | 1 + lustre/include/linux/obd.h | 2 + lustre/include/linux/obd_class.h | 6 + lustre/include/linux/obd_support.h | 4 +- lustre/mds/handler.c | 4 + lustre/obdclass/class_obd.c | 4 + lustre/obdclass/genops.c | 243 +++++++++++++++++++++++++++++++++++ lustre/obdclass/obd_config.c | 4 +- lustre/obdfilter/filter.c | 5 +- lustre/ptlrpc/import.c | 5 +- lustre/ptlrpc/niobuf.c | 4 +- lustre/ptlrpc/pinger.c | 42 +++--- lustre/ptlrpc/service.c | 27 ++-- lustre/tests/recovery-small.sh | 47 +++++-- lustre/tests/test-framework.sh | 2 +- 16 files changed, 357 insertions(+), 44 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index ace07ea..42187af3 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -19,6 +19,7 @@ tbd Cluster File Systems, Inc. 
- don't hold i_size_sem in ll_nopage() and ll_ap_refresh_count (6077) - don't hold client locks on temporary worklist from l_lru (5666) - handle IO errors in 2.6 obdfilter bio completion routine (6046) + - automatically evict dead clients (5921) * miscellania - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs - allow --write-conf on an MDS with different nettype than client (5619) diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index 5136d66..d06af11 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -55,6 +55,7 @@ struct obd_export { atomic_t exp_refcount; struct obd_uuid exp_client_uuid; struct list_head exp_obd_chain; + struct list_head exp_obd_chain_timed; /* for ping evictor */ struct obd_device *exp_obd; struct obd_import *exp_imp_reverse; /* to make RPCs backwards */ struct ptlrpc_connection *exp_connection; diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 4dca21b..3715578 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -525,6 +525,8 @@ struct obd_device { struct llog_ctxt *obd_llog_ctxt[LLOG_MAX_CTXTS]; struct obd_device *obd_observer; struct obd_export *obd_self_export; + struct list_head obd_exports_timed; /* for ping evictor */ + time_t obd_eviction_timer; /* for ping evictor */ /* XXX encapsulate all this recovery data into one struct */ svc_handler_t obd_recovery_handler; diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index e7d1afd..0a12a08 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -130,6 +130,7 @@ do { \ void __class_export_put(struct obd_export *); struct obd_export *class_new_export(struct obd_device *obddev); void class_unlink_export(struct obd_export *exp); +void class_update_export_timer(struct obd_export *exp, time_t extra_delay); struct obd_import *class_import_get(struct obd_import *); void 
class_import_put(struct obd_import *); @@ -159,6 +160,11 @@ void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid); int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare); void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); +/* ping evictor */ +void ping_evictor_start(void); +void ping_evictor_stop(void); + + #define OBT(dev) (dev)->obd_type #define OBP(dev, op) (dev)->obd_type->typ_ops->o_ ## op #define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 2c54309..c45aa38 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -37,7 +37,8 @@ extern atomic_t obd_memory; extern int obd_memmax; extern unsigned int obd_fail_loc; extern unsigned int obd_dump_on_timeout; -extern unsigned int obd_timeout; +extern unsigned int obd_timeout; /* seconds */ +#define PING_INTERVAL (obd_timeout / 4) extern unsigned int ldlm_timeout; extern char obd_lustre_upcall[128]; extern unsigned int obd_sync_filter; @@ -146,6 +147,7 @@ extern wait_queue_head_t obd_race_waitq; #define OBD_FAIL_PTLRPC_RQBD 0x502 #define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 42367a2..f7739bc 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1763,6 +1763,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_replayable ? 
"enabled" : "disabled"); } + ping_evictor_start(); + sema_init(&mds->mds_quota_info.qi_sem, 1); rc = qctxt_init(&mds->mds_quota_ctxt, mds->mds_sb, dqacq_handler); if (rc) { @@ -1951,6 +1953,8 @@ static int mds_cleanup(struct obd_device *obd) int must_relock = 0; ENTRY; + ping_evictor_stop(); + if (mds->mds_sb == NULL) RETURN(0); save_dev = ll_sbdev(mds->mds_sb); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 201bba5..4d01896 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -444,6 +444,7 @@ EXPORT_SYMBOL(class_conn2cliimp); EXPORT_SYMBOL(class_disconnect); EXPORT_SYMBOL(class_disconnect_exports); EXPORT_SYMBOL(class_disconnect_stale_exports); +EXPORT_SYMBOL(class_update_export_timer); EXPORT_SYMBOL(oig_init); EXPORT_SYMBOL(oig_release); @@ -451,6 +452,9 @@ EXPORT_SYMBOL(oig_add_one); EXPORT_SYMBOL(oig_wait); EXPORT_SYMBOL(oig_complete_one); +EXPORT_SYMBOL(ping_evictor_start); +EXPORT_SYMBOL(ping_evictor_stop); + /* uuid.c */ EXPORT_SYMBOL(class_uuid_unparse); EXPORT_SYMBOL(lustre_uuid_to_peer); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index db8500f..83de073 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -550,14 +550,18 @@ struct obd_export *class_new_export(struct obd_device *obd) INIT_LIST_HEAD(&export->exp_handle.h_link); class_handle_hash(&export->exp_handle, export_handle_addref); + export->exp_last_request_time = CURRENT_SECONDS; spin_lock_init(&export->exp_lock); spin_lock(&obd->obd_dev_lock); LASSERT(!obd->obd_stopping); /* shouldn't happen, but might race */ atomic_inc(&obd->obd_refcount); list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); + list_add_tail(&export->exp_obd_chain_timed, + &export->exp_obd->obd_exports_timed); export->exp_obd->obd_num_exports++; spin_unlock(&obd->obd_dev_lock); + obd_init_export(export); return export; } @@ -568,6 +572,7 @@ void class_unlink_export(struct obd_export *exp) 
spin_lock(&exp->exp_obd->obd_dev_lock); list_del_init(&exp->exp_obd_chain); + list_del_init(&exp->exp_obd_chain_timed); exp->exp_obd->obd_num_exports--; spin_unlock(&exp->exp_obd->obd_dev_lock); @@ -948,3 +953,241 @@ int oig_wait(struct obd_io_group *oig) CDEBUG(D_CACHE, "done waiting on oig %p rc %d\n", oig, oig->oig_rc); return oig->oig_rc; } + + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 +#define D_PET D_HA + +static int pet_refcount = 0; +static int pet_state; +static wait_queue_head_t pet_waitq; +static struct obd_export *pet_exp = NULL; +static spinlock_t pet_lock = SPIN_LOCK_UNLOCKED; + +static int ping_evictor_wake(struct obd_export *exp) +{ +#ifdef __KERNEL__ + spin_lock(&pet_lock); + if (pet_exp) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + pet_exp = exp; + spin_unlock(&pet_lock); + + /* We have to make sure the obd isn't destroyed between now and when + the ping evictor runs. We'll take a reference here, and drop it + when we finish in the evictor. We don't really care about this + export in particular; we just need one to keep the obd. */ + class_export_get(pet_exp); + wake_up(&pet_waitq); +#endif + return 0; +} + +#ifdef __KERNEL__ +/* Same as ptlrpc_fail_export, but this module must load first... */ +void ping_evictor_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + unsigned long flags; + + spin_lock_irqsave(&exp->exp_lock, flags); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock_irqrestore(&exp->exp_lock, flags); + + if (already_failed) { + CDEBUG(D_PET, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_PET, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. 
*/ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + CERROR("disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); +} + +static int ping_evictor_main(void *arg) +{ + struct list_head *pos, *n; + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + unsigned long flags; + ENTRY; + + lock_kernel(); + kportal_daemonize("ping_evictor"); + SIGNAL_MASK_LOCK(current, flags); + sigfillset(&current->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + unlock_kernel(); + + CDEBUG(D_PET, "Starting Ping Evictor\n"); + pet_exp = NULL; + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, pet_exp || + (pet_state == PET_TERMINATE), &lwi); + if (pet_state == PET_TERMINATE) + break; + + obd = pet_exp->exp_obd; + expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2); + + CDEBUG(D_PET, "evicting all exports of obd %s older than %ld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list, which means we + can't lose the last ref on the export, while we hold the obd + lock (class_unlink_export). If they've already been + removed from the list, we won't find them here. */ + spin_lock(&obd->obd_dev_lock); + list_for_each_safe(pos, n, &obd->obd_exports_timed) { + int stop = 0; + exp = list_entry(pos, struct obd_export, + exp_obd_chain_timed); + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + + if (expire_time > exp->exp_last_request_time) { + LCONSOLE_WARN("%s hasn't heard from %s in %ld " + "seconds. 
I think it's dead, " + "and I am evicting it.\n", + obd->obd_name, + exp->exp_client_uuid.uuid, + (long)(CURRENT_SECONDS - + exp->exp_last_request_time)); + ping_evictor_fail_export(exp); + } else { + /* List is sorted, so everyone below is ok */ + stop++; + } + class_export_put(exp); + /* lock again for the next entry */ + spin_lock(&obd->obd_dev_lock); + + if (stop) + break; + } + spin_unlock(&obd->obd_dev_lock); + class_export_put(pet_exp); + pet_exp = NULL; + } + CDEBUG(D_PET, "Exiting Ping Evictor\n"); + + RETURN(0); +} +#endif + +void ping_evictor_start(void) +{ +#ifdef __KERNEL__ + int rc; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + rc = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS); + if (rc < 0) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %d\n", rc); + } +#endif +} + +void ping_evictor_stop(void) +{ +#ifdef __KERNEL__ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +#endif +} + +/* This function makes sure dead exports are evicted in a timely manner. + This function is only called when some export receives a message (i.e., + the network is up.) */ +void class_update_export_timer(struct obd_export *exp, time_t extra_delay) +{ + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. 
*/ + exp->exp_last_request_time = max(exp->exp_last_request_time, + (time_t)CURRENT_SECONDS + extra_delay); + + CDEBUG(D_PET, "updating export %s at %ld\n", + exp->exp_client_uuid.uuid, + exp->exp_last_request_time); + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + return; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + struct obd_export *oldest_exp; + /* Check if the oldest entry is expired. */ + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (CURRENT_SECONDS > (oldest_exp->exp_last_request_time + + (3 * obd_timeout / 2) + extra_delay)) { + /* We need a second timer, in case the net was + down and it just came back. Since the pinger + may skip every other PING_INTERVAL (see note in + ptlrpc_pinger_main), we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS + + 3 * PING_INTERVAL; + CDEBUG(D_PET, + "Thinking about evicting old export %s at %ld\n", + oldest_exp->exp_client_uuid.uuid, + oldest_exp->exp_last_request_time); + } + } else { + spin_unlock(&exp->exp_obd->obd_dev_lock); + if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer + + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + recently, so we don't have to check before we start + it. 
*/ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } +} + diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 9277d2a..060e675 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -95,6 +95,7 @@ int class_attach(struct lustre_cfg *lcfg) cleanup_phase = 3; /* class_release_dev */ INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); obd->obd_num_exports = 0; spin_lock_init(&obd->obd_dev_lock); spin_lock_init(&obd->obd_osfs_lock); @@ -185,6 +186,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) memcpy(&exp->exp_client_uuid, &obd->obd_uuid, sizeof(exp->exp_client_uuid)); obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); class_export_put(exp); err = obd_setup(obd, sizeof(*lcfg), lcfg); @@ -333,7 +335,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(out, err = -EBUSY); } CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", - obd->obd_name, atomic_read(&obd->obd_refcount)); + obd->obd_name, atomic_read(&obd->obd_refcount) - 1); dump_exports(obd); class_disconnect_exports(obd); } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 4dcf70e..4785a27 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1349,6 +1349,8 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf) lproc_filter_attach_seqstat(obd); } + ping_evictor_start(); + return rc; } @@ -1378,6 +1380,8 @@ static int filter_cleanup(struct obd_device *obd) } } + ping_evictor_stop(); + qctxt_cleanup(&filter->fo_quota_ctxt, 0); ldlm_namespace_free(obd->obd_namespace, obd->obd_force); @@ -1413,7 +1417,6 @@ static int filter_cleanup(struct obd_device *obd) //destroy_buffers(filter->fo_sb->s_dev); filter->fo_sb = NULL; - ll_clear_rdonly(save_dev); if (must_relock) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 3a79e8a..df39056 100644 --- 
a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -431,11 +431,14 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + /* All imports are pingable */ + imp->imp_pingable = 1; + if (aa->pcaa_initial_connect) { if (msg_flags & MSG_CONNECT_REPLAYABLE) { CDEBUG(D_HA, "connected to replayable target: %s\n", imp->imp_target_uuid.uuid); - imp->imp_pingable = imp->imp_replayable = 1; + imp->imp_replayable = 1; } else { imp->imp_replayable = 0; } diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index ee2257e..73a5e47 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -392,6 +392,8 @@ int ptl_send_rpc(struct ptlrpc_request *request) ptl_md_t reply_md; ENTRY; + OBD_FAIL_RETURN(OBD_FAIL_PTLRPC_DROP_RPC, 0); + LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST); /* If this is a re-transmit, we're required to have disengaged @@ -406,7 +408,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_err = 1; RETURN(-ENODEV); } - + connection = request->rq_import->imp_connection; if (request->rq_bulk != NULL) { diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 05172e2..4b79c69 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -66,7 +66,7 @@ int ptlrpc_ping(struct obd_import *imp) static inline void ptlrpc_update_next_ping(struct obd_import *imp) { - imp->imp_next_ping = jiffies + obd_timeout * HZ; + imp->imp_next_ping = jiffies + PING_INTERVAL * HZ; } #ifdef __KERNEL__ @@ -99,7 +99,7 @@ static int ptlrpc_pinger_main(void *arg) while (1) { unsigned long this_ping = jiffies; long time_to_next_ping; - struct l_wait_info lwi = LWI_TIMEOUT(obd_timeout * HZ, + struct l_wait_info lwi = LWI_TIMEOUT(PING_INTERVAL * HZ, NULL, NULL); struct list_head *iter; @@ -120,12 +120,15 @@ static int ptlrpc_pinger_main(void *arg) spin_unlock_irqrestore(&imp->imp_lock, flags); if (force || - time_after_eq(this_ping, imp->imp_next_ping)) { + /* if the 
next ping is within, say, 5 jiffies from + now, go ahead and ping. See note below. */ + time_after_eq(this_ping, imp->imp_next_ping - 5)) { if (level == LUSTRE_IMP_DISCON && !imp->imp_deactive) { /* wait at least a timeout before trying recovery again. */ - ptlrpc_update_next_ping(imp); + imp->imp_next_ping = jiffies + + obd_timeout * HZ; ptlrpc_initiate_recovery(imp); } else if (level != LUSTRE_IMP_FULL || @@ -140,25 +143,32 @@ static int ptlrpc_pinger_main(void *arg) ptlrpc_ping(imp); } - } else if (!imp->imp_pingable) { - continue; + } else { + if (!imp->imp_pingable) + continue; + CDEBUG(D_HA, + "don't need to ping %s (%lu > %lu)\n", + imp->imp_target_uuid.uuid, + imp->imp_next_ping, this_ping); } - CDEBUG(D_HA, "don't need to ping %s (%lu > %lu)\n", - imp->imp_target_uuid.uuid, - imp->imp_next_ping, this_ping); - /* obd_timeout might have changed */ if (time_after(imp->imp_next_ping, - this_ping + obd_timeout * HZ)) + this_ping + PING_INTERVAL * HZ)) ptlrpc_update_next_ping(imp); } up(&pinger_sem); /* Wait until the next ping time, or until we're stopped. */ - time_to_next_ping = this_ping + (obd_timeout * HZ) - jiffies; + time_to_next_ping = this_ping + (PING_INTERVAL * HZ) - jiffies; + /* The ping sent by ptlrpc_send_rpc may get sent out + say .01 second after this. + ptlrpc_pinger_sending_on_import will then set the + next ping time to next_ping + .01 sec, which means + we will SKIP the next ping at next_ping, and the + ping will get sent 2 timeouts from now! Beware. */ CDEBUG(D_HA, "next ping in %lu (%lu)\n", time_to_next_ping, - this_ping + obd_timeout * HZ); + this_ping + PING_INTERVAL * HZ); if (time_to_next_ping > 0) { lwi = LWI_TIMEOUT(time_to_next_ping, NULL, NULL); l_wait_event(thread->t_ctl_waitq, @@ -346,7 +356,7 @@ static int pinger_check_rpcs(void *arg) int generation, level; unsigned long flags; - if (time_after_eq(pd->pd_this_ping, imp->imp_next_ping)) { + if (time_after_eq(pd->pd_this_ping, imp->imp_next_ping - 5)) { /* Add a ping. 
*/ spin_lock_irqsave(&imp->imp_lock, flags); generation = imp->imp_generation; @@ -399,7 +409,7 @@ do_check_set: rc = ptlrpc_check_set(set); /* not finished, and we are not expired, simply return */ - if (!rc && time_before(curtime, pd->pd_this_ping + obd_timeout * HZ)) { + if (!rc && time_before(curtime, pd->pd_this_ping + PING_INTERVAL * HZ)) { CDEBUG(D_HA, "not finished, but also not expired\n"); pd->pd_recursion--; return 0; @@ -430,7 +440,7 @@ do_check_set: ptlrpc_set_destroy(set); pd->pd_set = NULL; - pd->pd_next_ping = pd->pd_this_ping + obd_timeout * HZ; + pd->pd_next_ping = pd->pd_this_ping + PING_INTERVAL * HZ; pd->pd_this_ping = 0; /* XXX for debug */ CDEBUG(D_HA, "finished a round ping\n"); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 829c078..1702e0b 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -443,6 +443,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) int rc; ENTRY; + LASSERT(svc); + spin_lock_irqsave (&svc->srv_lock, flags); if (list_empty (&svc->srv_request_queue) || (svc->srv_n_difficult_replies != 0 && @@ -494,17 +496,6 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid); - /* Discard requests queued for longer than my timeout. 
If the - * client's timeout is similar to mine, she'll be timing out this - * REQ anyway (bug 1502) */ - if (timediff / 1000000 > (long)obd_timeout) { - CERROR("Dropping timed-out opc %d request from %s" - ": %ld seconds old\n", request->rq_reqmsg->opc, - request->rq_peerstr, - timediff / 1000000); - goto out; - } - request->rq_export = class_conn2export(&request->rq_reqmsg->handle); if (request->rq_export) { @@ -527,7 +518,19 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) goto put_conn; } - request->rq_export->exp_last_request_time = CURRENT_SECONDS; + class_update_export_timer(request->rq_export, + (time_t)(timediff / 1000000)); + } + + /* Discard requests queued for longer than my timeout. If the + * client's timeout is similar to mine, she'll be timing out this + * REQ anyway (bug 1502) */ + if (timediff / 1000000 > (long)obd_timeout) { + CERROR("Dropping timed-out opc %d request from %s" + ": %ld seconds old\n", request->rq_reqmsg->opc, + request->rq_peerstr, + timediff / 1000000); + goto put_conn; } request->rq_phase = RQ_PHASE_INTERPRET; diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 2980fdf..2c1fbd9 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -395,7 +395,33 @@ test_24() { # bug 2248 - eviction fails writeback but app doesn't see it } run_test 24 "fsync error (should return error)" -test_25a() { +test_26() { # bug 5921 - evict dead exports +# this test can only run from a client on a separate node. + [ "`lsmod | grep obdfilter`" ] && \ + echo "skipping test 26 (local OST)" && return + [ "`lsmod | grep mds`" ] && \ + echo "skipping test 26 (local MDS)" && return + OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports + OST_EXP="`do_facet ost cat $OST_FILE`" + OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2` + echo starting with $OST_NEXP1 OST exports +# OBD_FAIL_PTLRPC_DROP_RPC 0x505 + do_facet client sysctl -w lustre.fail_loc=0x505 + # evictor takes up to 2.25x to evict. 
But if there's a + # race to start the evictor from various obds, the loser + # might have to wait for the next ping. + echo Waiting for $(($TIMEOUT * 4)) secs + sleep $(($TIMEOUT * 4)) + OST_EXP="`do_facet ost cat $OST_FILE`" + OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2` + echo ending with $OST_NEXP2 OST exports + do_facet client sysctl -w lustre.fail_loc=0x0 + [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted" + return 0 +} +run_test 26 "evict dead exports" + +test_50() { # bug 4834 - failover under load failures mkdir -p $DIR/$tdir # put a load of file creates/writes/deletes for 10 min. do_facet client "writemany -q -a $DIR/$tdir/$tfile 600 5" & @@ -415,9 +441,9 @@ test_25a() { echo writemany returned $rc return $rc } -run_test 25a "failover MDS under load" +run_test 50 "failover MDS under load" -test_25b() { +test_51() { mkdir -p $DIR/$tdir # put a load of file creates/writes/deletes do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" & @@ -442,9 +468,9 @@ test_25b() { echo writemany returned $rc return $rc } -run_test 25b "failover MDS during recovery" +run_test 51 "failover MDS during recovery" -test_25c_guts() { +test_52_guts() { do_facet client "writemany -q $DIR/$tdir/$tfile 600 5" & CLIENT_PID=$! echo writemany pid $CLIENT_PID @@ -461,22 +487,23 @@ test_25c_guts() { return $rc } -test_25c() { +test_52() { mkdir -p $DIR/$tdir - test_25c_guts + test_52_guts rc=$? [ $rc -ne 0 ] && { return $rc; } # wait for client to reconnect to OST sleep 30 - test_25c_guts + test_52_guts rc=$? [ $rc -ne 0 ] && { return $rc; } sleep 30 - test_25c_guts + test_52_guts rc=$? 
client_reconnect return $rc } -run_test 25c "failover OST under load" +run_test 52 "failover OST under load" + FORCE=--force $CLEANUP diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 6173ecd..184b18c 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -151,13 +151,13 @@ client_df() { } client_reconnect() { - df $MOUNT > /dev/null uname -n >> $MOUNT/recon if [ ! -z "$CLIENTS" ]; then $PDSH $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null fi echo Connected clients: cat $MOUNT/recon + ls -l $MOUNT/recon > /dev/null rm $MOUNT/recon } -- 1.8.3.1