From f8aafa8913a997830956e6149e908edffa1f0d8d Mon Sep 17 00:00:00 2001 From: pjkirner Date: Tue, 12 Jul 2005 13:24:45 +0000 Subject: [PATCH] b=1693 r=adilger Landing "Provide a health-check routine for MDS and OSTs" --- lustre/ChangeLog | 6 ++++++ lustre/include/linux/lustre_net.h | 1 + lustre/include/linux/obd.h | 8 ++++++++ lustre/include/linux/obd_class.h | 25 ++++++++++++++++++++++++ lustre/include/linux/obd_support.h | 1 + lustre/mds/handler.c | 33 +++++++++++++++++++++++++++++++ lustre/obdclass/class_obd.c | 29 ++++++++++++++++++++++++--- lustre/obdclass/lprocfs_status.c | 1 + lustre/obdfilter/filter.c | 16 +++++++++++++++ lustre/ost/ost_handler.c | 29 +++++++++++++++++++++++++++ lustre/ptlrpc/ptlrpc_module.c | 1 + lustre/ptlrpc/service.c | 40 ++++++++++++++++++++++++++++++++++++++ 12 files changed, 187 insertions(+), 3 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 1f3145d..4200c89 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -111,6 +111,12 @@ Description: Failover mode is now the default for OSTs. Details : By default, OSTs will now run in failover mode. To return to the old behaviour, add '--failout' to the lmc line for OSTs. +Severity : enhancement +Bugzilla : 1693 +Description: Health checks are now provided for MDS and OSTs +Details : Additional detailed health check information on MDS and OSTs + is now provided through the procfs health_check value. + ------------------------------------------------------------------------------ 2005-06-20 Cluster File Systems, Inc. 
diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 3e44c52..658e6e0 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -726,6 +726,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, int ptlrpc_unregister_service(struct ptlrpc_service *service); int liblustre_check_services (void *arg); void ptlrpc_daemonize(void); +int ptlrpc_service_health_check(struct ptlrpc_service *); struct ptlrpc_svc_data { diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 51f1910..823b24e 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -368,6 +368,7 @@ struct mds_obd { struct lustre_quota_info mds_quota_info; struct lustre_quota_ctxt mds_quota_ctxt; atomic_t mds_quotachecking; + struct semaphore mds_health_sem; }; struct echo_obd { @@ -409,6 +410,7 @@ struct recovd_obd { struct ost_obd { struct ptlrpc_service *ost_service; struct ptlrpc_service *ost_create_service; + struct semaphore ost_health_sem; }; struct echo_client_obd { @@ -737,6 +739,8 @@ struct obd_ops { int (*o_notify)(struct obd_device *obd, struct obd_device *watched, int active); + int (*o_health_check)(struct obd_device *); + /* quota methods */ int (*o_quotacheck)(struct obd_export *, struct obd_quotactl *); int (*o_quotactl)(struct obd_export *, struct obd_quotactl *); @@ -745,6 +749,10 @@ struct obd_ops { * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. * Also, add a wrapper function in include/linux/obd_class.h. + * + * Also note that if you add it to the END, you also have to change + * the num_stats calculation. 
+ * */ }; diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index c24c01b..fe2b2c0 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -1046,6 +1046,31 @@ static inline int obd_quotactl(struct obd_export *exp, RETURN(rc); } +static inline int obd_health_check(struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only supported reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error (-EOPNOTSUPP for a NULL or cleaned-up obd) + */ + int rc; + ENTRY; + + /* don't use EXP_CHECK_OP: a NULL health_check method is normal here and is reported as healthy */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(obd); + RETURN(rc); +} static inline int obd_register_observer(struct obd_device *obd, struct obd_device *observer) diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 86c87bd..554461b 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -40,6 +40,7 @@ extern unsigned int obd_dump_on_timeout; extern unsigned int obd_timeout; /* seconds */ #define PING_INTERVAL max(obd_timeout / 4, 1U) extern unsigned int ldlm_timeout; +extern unsigned int obd_health_check_timeout; extern char obd_lustre_upcall[128]; extern unsigned int obd_sync_filter; extern wait_queue_head_t obd_race_waitq; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index c1c72f3..5ee8956 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -2047,6 +2047,8 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) lprocfs_init_vars(mdt, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); + + sema_init(&mds->mds_health_sem, 1); mds->mds_service = ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, @@ -2101,10 +2103,13 @@ static int 
mdt_setup(struct obd_device *obd, obd_count len, void *buf) err_thread3: ptlrpc_unregister_service(mds->mds_readpage_service); + mds->mds_readpage_service = NULL; err_thread2: ptlrpc_unregister_service(mds->mds_setattr_service); + mds->mds_setattr_service = NULL; err_thread: ptlrpc_unregister_service(mds->mds_service); + mds->mds_service = NULL; err_lprocfs: lprocfs_obd_cleanup(obd); return rc; @@ -2115,15 +2120,42 @@ static int mdt_cleanup(struct obd_device *obd) struct mds_obd *mds = &obd->u.mds; ENTRY; + down(&mds->mds_health_sem); ptlrpc_unregister_service(mds->mds_readpage_service); ptlrpc_unregister_service(mds->mds_setattr_service); ptlrpc_unregister_service(mds->mds_service); + mds->mds_readpage_service = NULL; + mds->mds_setattr_service = NULL; + mds->mds_service = NULL; + up(&mds->mds_health_sem); lprocfs_obd_cleanup(obd); RETURN(0); } +static int mdt_health_check(struct obd_device *obd) +{ + struct mds_obd *mds = &obd->u.mds; + int rc = 0; + + down(&mds->mds_health_sem); + rc |= ptlrpc_service_health_check(mds->mds_readpage_service); + rc |= ptlrpc_service_health_check(mds->mds_setattr_service); + rc |= ptlrpc_service_health_check(mds->mds_service); + up(&mds->mds_health_sem); + + /* + * health_check to return 0 on healthy + * and 1 on unhealthy. 
+ */ + if(rc != 0) + rc = 1; + + return rc; +} + + static struct dentry *mds_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr, void *data) { @@ -2162,6 +2194,7 @@ static struct obd_ops mdt_obd_ops = { .o_owner = THIS_MODULE, .o_setup = mdt_setup, .o_cleanup = mdt_cleanup, + .o_health_check = mdt_health_check, }; static int __init mds_init(void) diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 4f314e4..e8ceae7 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -90,6 +90,7 @@ unsigned int obd_fail_loc; unsigned int obd_dump_on_timeout; unsigned int obd_timeout = 100; /* seconds */ unsigned int ldlm_timeout = 20; /* seconds */ +unsigned int obd_health_check_timeout = 120; /* seconds */ char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall */ unsigned int obd_sync_filter; /* = 0, don't sync by default */ @@ -379,6 +380,7 @@ EXPORT_SYMBOL(obd_race_waitq); EXPORT_SYMBOL(obd_dump_on_timeout); EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(ldlm_timeout); +EXPORT_SYMBOL(obd_health_check_timeout); EXPORT_SYMBOL(obd_lustre_upcall); EXPORT_SYMBOL(obd_sync_filter); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); @@ -455,13 +457,12 @@ int obd_proc_read_pinger(char *page, char **start, off_t off, int count, static int obd_proc_read_health(char *page, char **start, off_t off, int count, int *eof, void *data) { - int rc = 0; //, i; + int rc = 0 , i; *eof = 1; if (portals_catastrophe) rc += snprintf(page + rc, count - rc, "LBUG\n"); -#if 0 spin_lock(&obd_dev_lock); for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd; @@ -482,7 +483,6 @@ static int obd_proc_read_health(char *page, char **start, off_t off, spin_lock(&obd_dev_lock); } spin_unlock(&obd_dev_lock); -#endif if (rc == 0) return snprintf(page, count, "healthy\n"); @@ -491,12 +491,35 @@ static int obd_proc_read_health(char *page, char **start, off_t off, return rc; } +static int obd_proc_rd_health_timeout(char *page, char **start, off_t off, + int 
count, int *eof, void *data) +{ + *eof = 1; + return snprintf(page, count, "%d\n", obd_health_check_timeout); +} + +static int obd_proc_wr_health_timeout(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd_health_check_timeout = val; + + return count; +} + /* Root for /proc/fs/lustre */ struct lprocfs_vars lprocfs_base[] = { { "version", obd_proc_read_version, NULL, NULL }, { "kernel_version", obd_proc_read_kernel_version, NULL, NULL }, { "pinger", obd_proc_read_pinger, NULL, NULL }, { "health_check", obd_proc_read_health, NULL, NULL }, + { "health_check_timeout", obd_proc_rd_health_timeout, + obd_proc_wr_health_timeout, NULL }, { 0 } }; #else diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 8a0db22..a3fcfc0 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -671,6 +671,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); for (i = num_private_stats; i < num_stats; i++) { /* If this LBUGs, it is likely that an obd diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 3f26239..fadf1bd 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2722,6 +2722,21 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(0); } +static int filter_health_check(struct obd_device *obd) +{ + struct filter_obd *filter = &obd->u.filter; + int rc = 0; + + /* + * health_check to return 0 on healthy + * and 1 on unhealthy. 
+ */ + if(filter->fo_sb->s_flags & MS_RDONLY) + rc = 1; + + return rc; +} + static struct dentry *filter_lvfs_fid2dentry(__u64 id, __u32 gen, __u64 gr, void *data) { @@ -2758,6 +2773,7 @@ static struct obd_ops filter_obd_ops = { .o_iocontrol = filter_iocontrol, .o_quotacheck = filter_quotacheck, .o_quotactl = filter_quotactl, + .o_health_check = filter_health_check, }; static struct obd_ops filter_sanobd_ops = { diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 9ca7b6d..025beaa 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1294,6 +1294,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) lprocfs_init_vars(ost, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); + sema_init(&ost->ost_health_sem, 1); + ost->ost_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, @@ -1330,8 +1332,10 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) out_create: ptlrpc_unregister_service(ost->ost_create_service); + ost->ost_create_service = NULL; out_service: ptlrpc_unregister_service(ost->ost_service); + ost->ost_service = NULL; out_lprocfs: lprocfs_obd_cleanup(obd); RETURN(rc); @@ -1350,14 +1354,38 @@ static int ost_cleanup(struct obd_device *obd) } spin_unlock_bh(&obd->obd_processing_task_lock); + down(&ost->ost_health_sem); ptlrpc_unregister_service(ost->ost_service); ptlrpc_unregister_service(ost->ost_create_service); + ost->ost_service = NULL; + ost->ost_create_service = NULL; + up(&ost->ost_health_sem); lprocfs_obd_cleanup(obd); RETURN(err); } +static int ost_health_check(struct obd_device *obd) +{ + struct ost_obd *ost = &obd->u.ost; + int rc = 0; + + down(&ost->ost_health_sem); + rc |= ptlrpc_service_health_check(ost->ost_service); + rc |= ptlrpc_service_health_check(ost->ost_create_service); + up(&ost->ost_health_sem); + + /* + * health_check to return 0 on healthy + * and 1 on unhealthy. 
+ */ + if( rc != 0) + rc = 1; + + return rc; +} + struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r) { return (struct ost_thread_local_cache *)(r->rq_svc_thread->t_data); @@ -1368,6 +1396,7 @@ static struct obd_ops ost_obd_ops = { .o_owner = THIS_MODULE, .o_setup = ost_setup, .o_cleanup = ost_cleanup, + .o_health_check = ost_health_check, }; static int __init ost_init(void) diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index f9c4bdb..3ca1cee 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -135,6 +135,7 @@ EXPORT_SYMBOL(ptlrpc_start_n_threads); EXPORT_SYMBOL(ptlrpc_start_thread); EXPORT_SYMBOL(ptlrpc_unregister_service); EXPORT_SYMBOL(ptlrpc_daemonize); +EXPORT_SYMBOL(ptlrpc_service_health_check); /* pack_generic.c */ EXPORT_SYMBOL(lustre_msg_swabbed); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index e54449d..14d99b5 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -1136,3 +1136,43 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) srv_interfaces[ptlrpc_ninterfaces])); return 0; } + +/* Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. 
*/
+int ptlrpc_service_health_check(struct ptlrpc_service *svc)
+{
+        struct ptlrpc_request *request;
+        struct timeval right_now;
+        long timediff, cutoff;
+        unsigned long flags;
+        int rc = 0;     /* default to healthy; previously left uninitialised on the in-time path */
+
+        if (svc == NULL)        /* no service to inspect: report healthy */
+                return 0;
+
+        spin_lock_irqsave(&svc->srv_lock, flags);       /* queue is examined under srv_lock */
+        if (list_empty(&svc->srv_request_queue)) {
+                rc = 0;
+                goto out;
+        }
+
+        request = list_entry(svc->srv_request_queue.next,      /* first entry on the request queue */
+                             struct ptlrpc_request, rq_list);
+
+        do_gettimeofday(&right_now);
+        timediff = timeval_sub(&right_now, &request->rq_arrival_time);
+
+        cutoff = obd_health_check_timeout;
+
+        if (timediff / 1000000 > cutoff) {      /* NOTE(review): assumes timeval_sub() yields usec and cutoff is seconds - confirm */
+                rc = -1;        /* request has waited longer than the cutoff: unhealthy */
+                goto out;
+        }
+
+ out:
+        spin_unlock_irqrestore(&svc->srv_lock, flags);
+        return rc;
+}
-- 
1.8.3.1