From a594ec4212c0ef4a619a3a3f932b30d0a700b96d Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Fri, 16 Apr 2010 13:30:30 -0700 Subject: [PATCH] b=18948 Speedy recovery Add hard and soft time limits for server recovery. i=andrew.perepechko i=Hongchao.zhang --- lustre/doc/mount.lustre.8 | 11 ++++++++++ lustre/include/lprocfs_status.h | 16 ++++++++------ lustre/include/lustre_disk.h | 3 +++ lustre/include/obd.h | 2 +- lustre/include/obd_support.h | 6 ++++-- lustre/ldlm/ldlm_lib.c | 27 +++++++++++------------ lustre/mdt/mdt_handler.c | 8 +++++++ lustre/obdclass/lprocfs_status.c | 44 ++++++++++++++++++++++++++++++-------- lustre/obdclass/obd_mount.c | 26 ++++++++++++++++++++-- lustre/obdfilter/filter.c | 8 +++++++ lustre/obdfilter/lproc_obdfilter.c | 8 ++++--- 11 files changed, 120 insertions(+), 39 deletions(-) diff --git a/lustre/doc/mount.lustre.8 b/lustre/doc/mount.lustre.8 index 5e67b56..f250ad2 100644 --- a/lustre/doc/mount.lustre.8 +++ b/lustre/doc/mount.lustre.8 @@ -107,6 +107,17 @@ Abort client recovery and start the target service immediately. .BI md_stripe_cache_size Sets the stripe cache size for server side disk with a striped raid configuration. +.TP +.BI recovery_time_soft= timeout +Allow 'timeout' seconds for clients to reconnect for recovery after a server +crash. This timeout will be incrementally extended if it is about to expire +and the server is still handling new connections from recoverable clients. +The default soft recovery timeout is set to 300 seconds (5 minutes). +.TP +.BI recovery_time_hard= timeout +The server will be allowed to incrementally extend its timeout up to a hard +maximum of 'timeout' seconds. The default hard recovery timeout is set to +900 seconds (15 minutes). 
.SH EXAMPLES .TP .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index c525517..7157130 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -650,13 +650,15 @@ struct file_operations name##_fops = { \ struct ptlrpc_request; extern void target_print_req(void *seq_file, struct ptlrpc_request *req); -/* lprocfs_status.c: read recovery max time bz13079 */ -int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off, - int count, int *eof, void *data); - -/* lprocfs_status.c: write recovery max time bz13079 */ -int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, - unsigned long count, void *data); +/* lprocfs_status.c */ +int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer, + unsigned long count, void *data); +int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer, + unsigned long count, void *data); /* all quota proc functions */ extern int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count, diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 4cbc744..04caa81 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -177,6 +177,8 @@ struct lustre_mount_data { __u32 lmd_flags; /* lustre mount flags */ int lmd_mgs_failnodes; /* mgs failover node count */ int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; char *lmd_dev; /* device name */ char *lmd_profile; /* client only */ char *lmd_mgssec; /* sptlrpc flavor to mgs */ @@ -464,6 +466,7 @@ void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); int lustre_common_put_super(struct super_block 
*sb); +struct lustre_mount_info *server_find_mount_locked(const char *name); struct lustre_mount_info *server_get_mount(const char *name); struct lustre_mount_info *server_get_mount_2(const char *name); int server_put_mount(const char *name, struct vfsmount *mnt); diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 81e86dc..9b3c538 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1077,7 +1077,7 @@ struct obd_device { cfs_timer_t obd_recovery_timer; time_t obd_recovery_start; /* seconds */ time_t obd_recovery_end; /* seconds, for lprocfs_status */ - time_t obd_recovery_max_time; /* seconds, bz13079 */ + time_t obd_recovery_time_hard; int obd_recovery_timeout; /* new recovery stuff from CMD2 */ diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 87798f8..bd83911 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -114,9 +114,11 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_TIMEOUT_DEFAULT 100 #define LDLM_TIMEOUT_DEFAULT 20 #define MDS_LDLM_TIMEOUT_DEFAULT 6 -/* Time to wait for all clients to reconnect during recovery */ +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ /* Should be very conservative; must catch the first reconnect after reboot */ -#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) /* Change recovery-small 26b time if you change this */ #define PING_INTERVAL max(obd_timeout / 4, 1U) /* Client may skip 1 ping; we must wait at least 2.5. 
But for multiple diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 3b4479b..50795ce 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1328,15 +1328,11 @@ static void reset_recovery_timer(struct obd_device *obd, int duration, else if (!extend && (duration > obd->obd_recovery_timeout)) /* Track the client's largest expected replay time */ obd->obd_recovery_timeout = duration; -#ifdef CRAY_XT3 - /* - * If total recovery time already exceed the - * obd_recovery_max_time, then CRAY XT3 will - * abort the recovery - */ - if(obd->obd_recovery_timeout > obd->obd_recovery_max_time) - obd->obd_recovery_timeout = obd->obd_recovery_max_time; -#endif + + /* Hard limit of obd_recovery_time_hard which should not happen */ + if (obd->obd_recovery_timeout > obd->obd_recovery_time_hard) + obd->obd_recovery_timeout = obd->obd_recovery_time_hard; + obd->obd_recovery_end = obd->obd_recovery_start + obd->obd_recovery_timeout; if (!cfs_timer_is_armed(&obd->obd_recovery_timer) || @@ -1358,8 +1354,6 @@ static void check_and_start_recovery_timer(struct obd_device *obd) } CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_start = cfs_time_current_sec(); - /* minimum */ - obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; cfs_spin_unlock_bh(&obd->obd_processing_task_lock); reset_recovery_timer(obd, obd->obd_recovery_timeout, 0); @@ -1807,7 +1801,7 @@ static int target_recovery_thread(void *arg) delta = (jiffies - delta) / CFS_HZ; CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n", delta, obd->obd_replayed_requests, obd->obd_replayed_locks); - if (delta > obd_timeout * OBD_RECOVERY_FACTOR) { + if (delta > OBD_RECOVERY_TIME_SOFT) { CWARN("too long recovery - read logs\n"); libcfs_debug_dumplog(); } @@ -1898,9 +1892,12 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) obd->obd_next_recovery_transno = obd->obd_last_committed + 1; obd->obd_recovery_start = 0; obd->obd_recovery_end 
= 0; - obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; - /* bz13079: this should be set to desired value for ost but not for mds */ - obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME; + + /* both values may already have been set from mount data */ + if (obd->obd_recovery_timeout == 0) + obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT; + if (obd->obd_recovery_time_hard == 0) + obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD; cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd); target_start_recovery_thread(lut, handler); } diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 63856b9..ab757f6 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -4526,6 +4526,14 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, CERROR("CMD Operation not allowed in IOP mode\n"); GOTO(err_lmi, rc = -EINVAL); } + /* Read recovery timeouts */ + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft) + obd->obd_recovery_timeout = + lsi->lsi_lmd->lmd_recovery_time_soft; + + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard) + obd->obd_recovery_time_hard = + lsi->lsi_lmd->lmd_recovery_time_hard; } cfs_rwlock_init(&m->mdt_sptlrpc_lock); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index e4b063e..8dfd79e 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -2237,18 +2237,45 @@ out: } EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status); -int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off, - int count, int *eof, void *data) +int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%d\n", + obd->obd_recovery_timeout); +} +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft); + +int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char 
*buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft); + +int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off, + int count, int *eof, void *data) { struct obd_device *obd = data; LASSERT(obd != NULL); - return snprintf(page, count, "%lu\n", obd->obd_recovery_max_time); + return snprintf(page, count, "%lu\n", obd->obd_recovery_time_hard); } -EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime); +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard); -int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, - unsigned long count, void *data) +int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer, + unsigned long count, void *data) { struct obd_device *obd = data; int val, rc; @@ -2258,11 +2285,10 @@ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, if (rc) return rc; - obd->obd_recovery_max_time = val; + obd->obd_recovery_time_hard = val; return count; } -EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime); - +EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard); EXPORT_SYMBOL(lprocfs_register); EXPORT_SYMBOL(lprocfs_srch); diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index b8c7e7d..3d211be 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -1199,6 +1199,8 @@ struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) } lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; s2lsi_nocast(sb) = lsi; /* we take 1 extra ref for our setup */ cfs_atomic_set(&lsi->lsi_mounts, 1); @@ -1748,7 +1750,6 @@ int lustre_common_put_super(struct super_block *sb) RETURN(rc); } -#if 0 static void lmd_print(struct lustre_mount_data 
*lmd) { int i; @@ -1758,14 +1759,23 @@ static void lmd_print(struct lustre_mount_data *lmd) PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile); PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev); PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags); + if (lmd->lmd_opts) PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + for (i = 0; i < lmd->lmd_exclude_count; i++) { PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i, lmd->lmd_exclude[i]); } } -#endif /* Is this server on the exclusion list */ int lustre_check_exclusion(struct super_block *sb, char *svname) @@ -1902,6 +1912,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) s1 = options; while (*s1) { int clear = 0; + int time_min = 2 * (CONNECTION_SWITCH_MAX + + 2 * INITIAL_CONNECT_TIMEOUT); + /* Skip whitespace and extra commas */ while (*s1 == ' ' || *s1 == ',') s1++; @@ -1914,6 +1927,14 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) if (strncmp(s1, "abort_recov", 11) == 0) { lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; } else if (strncmp(s1, "nosvc", 5) == 0) { lmd->lmd_flags |= LMD_FLG_NOSVC; clear++; @@ -1993,6 +2014,7 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) strcpy(lmd->lmd_opts, options); } + lmd_print(lmd); lmd->lmd_magic = LMD_MAGIC; RETURN(rc); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 4169e19..a88bd4ae 100644 --- 
a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1988,6 +1988,14 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb); mnt = lmi->lmi_mnt; obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd)); + + /* gets recovery timeouts from mount data */ + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft) + obd->obd_recovery_timeout = + lsi->lsi_lmd->lmd_recovery_time_soft; + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard) + obd->obd_recovery_time_hard = + lsi->lsi_lmd->lmd_recovery_time_hard; } else { /* old path - used by lctl */ CERROR("Using old MDS mount method\n"); diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index e49386e..90fec4e 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -382,9 +382,11 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = { { "tot_pending", lprocfs_filter_rd_tot_pending, 0, 0 }, { "tot_granted", lprocfs_filter_rd_tot_granted, 0, 0 }, { "hash_stats", lprocfs_obd_rd_hash, 0, 0 }, - { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, - { "recovery_maxtime", lprocfs_obd_rd_recovery_maxtime, - lprocfs_obd_wr_recovery_maxtime, 0}, + { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, + { "recovery_time_soft", lprocfs_obd_rd_recovery_time_soft, + lprocfs_obd_wr_recovery_time_soft, 0}, + { "recovery_time_hard", lprocfs_obd_rd_recovery_time_hard, + lprocfs_obd_wr_recovery_time_hard, 0}, { "evict_client", 0, lprocfs_wr_evict_client, 0, &lprocfs_evict_client_fops}, { "num_exports", lprocfs_rd_num_exports, 0, 0 }, -- 1.8.3.1