From ea1aa75ddbf2c4b28aa40dcff4e91d2339e2dbb4 Mon Sep 17 00:00:00 2001 From: anserper Date: Tue, 21 Jul 2009 14:03:56 +0000 Subject: [PATCH] b=18948 i=Nathan Rutman i=Alexander Zarochentsev tunable recovery patch --- lustre/doc/mount.lustre.8 | 11 ++++++++++ lustre/include/lprocfs_status.h | 16 +++++++------- lustre/include/lustre_disk.h | 3 +++ lustre/include/obd.h | 4 +--- lustre/include/obd_support.h | 9 ++++---- lustre/ldlm/ldlm_lib.c | 18 +++++---------- lustre/mds/mds_fs.c | 23 ++++++++++++++----- lustre/obdclass/lprocfs_status.c | 45 +++++++++++++++++++++++++++++--------- lustre/obdclass/obd_mount.c | 36 +++++++++++++++++++++++++++++- lustre/obdfilter/filter.c | 23 ++++++++++++++----- lustre/obdfilter/lproc_obdfilter.c | 10 ++++----- 11 files changed, 144 insertions(+), 54 deletions(-) diff --git a/lustre/doc/mount.lustre.8 b/lustre/doc/mount.lustre.8 index 5e67b56..f250ad2 100644 --- a/lustre/doc/mount.lustre.8 +++ b/lustre/doc/mount.lustre.8 @@ -107,6 +107,17 @@ Abort client recovery and start the target service immediately. .BI md_stripe_cache_size Sets the stripe cache size for server side disk with a striped raid configuration. +.TP +.BI recovery_time_soft= timeout +Allow 'timeout' seconds for clients to reconnect for recovery after a server +crash. This timeout will be incrementally extended if it is about to expire +and the server is still handling new connections from recoverable clients. +The default soft recovery timeout is set to 300 seconds (5 minutes). +.TP +.BI recovery_time_hard= timeout +The server will be allowed to incrementally extend its timeout up to a hard +maximum of 'timeout' seconds. The default hard recovery timeout is set to +900 seconds (15 minutes). 
.SH EXAMPLES .TP .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 7106e91..175ddc1 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -657,15 +657,15 @@ struct file_operations name##_fops = { \ struct ptlrpc_request; extern void target_print_req(void *seq_file, struct ptlrpc_request *req); -#ifdef CRAY_XT3 -/* lprocfs_status.c: read recovery max time bz13079 */ -int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off, - int count, int *eof, void *data); +int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer, + unsigned long count, void *data); +int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer, + unsigned long count, void *data); -/* lprocfs_status.c: write recovery max time bz13079 */ -int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, - unsigned long count, void *data); -#endif #ifdef HAVE_DELAYED_RECOVERY int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off, int count, int *eof, void *data); diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 19092ea..73ace63 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -156,6 +156,8 @@ struct lustre_mount_data { __u32 lmd_flags; /* lustre mount flags */ int lmd_mgs_failnodes; /* mgs failover node count */ int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; char *lmd_dev; /* device name */ char *lmd_profile; /* client only */ char *lmd_opts; /* lustre mount options (as opposed to @@ -316,6 +318,7 @@ int lustre_process_log(struct super_block *sb, char *logname, struct config_llog_instance 
*cfg); int lustre_end_log(struct super_block *sb, char *logname, struct config_llog_instance *cfg); +struct lustre_mount_info *server_find_mount_locked(char *name); struct lustre_mount_info *server_get_mount(char *name); int server_put_mount(char *name, struct vfsmount *mnt); int server_register_target(struct super_block *sb); diff --git a/lustre/include/obd.h b/lustre/include/obd.h index d7279cd..a2bc1fa 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -991,9 +991,7 @@ struct obd_device { struct list_head obd_delayed_reply_queue; time_t obd_recovery_start; /* seconds */ time_t obd_recovery_end; /* seconds, for lprocfs_status */ -#ifdef CRAY_XT3 - time_t obd_recovery_max_time; /* seconds, bz13079 */ -#endif + time_t obd_recovery_time_hard; int obd_recovery_timeout; union { diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 6532294..b50e3bb 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -80,12 +80,11 @@ extern unsigned int obd_alloc_fail_rate; #else #define STALE_EXPORT_MAXTIME_DEFAULT (0) /**< zero if no delayed recovery */ #endif -#ifdef CRAY_XT3 - #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ -#endif -/* Time to wait for all clients to reconnect during recovery */ +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ /* Should be very conservative; must catch the first reconnect after reboot */ -#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) /* Change recovery-small 26b time if you change this */ #define PING_INTERVAL max(obd_timeout / 4, 1U) /* a bit more than maximal journal commit time in seconds */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 0b305b9..3d0b72e 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -612,7 +612,7 
@@ int target_recovery_check_and_stop(struct obd_device *obd) obd->obd_version_recov = 1; spin_unlock_bh(&obd->obd_processing_task_lock); /* reset timer, recovery will proceed with versions now */ - reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1); + reset_recovery_timer(obd, OBD_RECOVERY_TIME_SOFT, 1); return 0; } EXPORT_SYMBOL(target_recovery_check_and_stop); @@ -1339,15 +1339,11 @@ static void reset_recovery_timer(struct obd_device *obd, int duration, else if (!extend && (duration > obd->obd_recovery_timeout)) /* Track the client's largest expected replay time */ obd->obd_recovery_timeout = duration; -#ifdef CRAY_XT3 - /* - * If total recovery time already exceed the - * obd_recovery_max_time, then CRAY XT3 will - * abort the recovery - */ - if(obd->obd_recovery_timeout > obd->obd_recovery_max_time) - obd->obd_recovery_timeout = obd->obd_recovery_max_time; -#endif + + /* Hard limit of obd_recovery_time_hard which should not happen */ + if(obd->obd_recovery_timeout > obd->obd_recovery_time_hard) + obd->obd_recovery_timeout = obd->obd_recovery_time_hard; + obd->obd_recovery_end = obd->obd_recovery_start + obd->obd_recovery_timeout; if (cfs_time_before(now, obd->obd_recovery_end)) { @@ -1369,8 +1365,6 @@ static void check_and_start_recovery_timer(struct obd_device *obd, } CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_start = cfs_time_current_sec(); - /* minimum */ - obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; obd->obd_recovery_handler = handler; cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd); spin_unlock_bh(&obd->obd_processing_task_lock); diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index c69228e..97e6c38 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -463,6 +463,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) struct mds_obd *mds = &obd->u.mds; struct lr_server_data *lsd; struct lsd_client_data *lcd = NULL; + struct 
lustre_mount_info *lmi; loff_t off = 0; unsigned long last_rcvd_size = i_size_read(file->f_dentry->d_inode); __u64 mount_count; @@ -686,16 +687,28 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) obd->obd_recovering = 1; obd->obd_recovery_start = 0; obd->obd_recovery_end = 0; - obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; -#ifdef CRAY_XT3 - /* bz13079: this won't be changed for mds */ - obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME; -#endif } else { LASSERT(!obd->obd_recovering); /* VBR: update boot epoch after recovery */ mds_update_last_epoch(obd); } + + obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT; + obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD; + + lmi = server_find_mount_locked(obd->obd_name); + if (lmi) { + struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb); + + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft) + obd->obd_recovery_timeout = + lsi->lsi_lmd->lmd_recovery_time_soft; + + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard) + obd->obd_recovery_time_hard = + lsi->lsi_lmd->lmd_recovery_time_hard; + } + mds->mds_mount_count = mount_count + 1; lsd->lsd_mount_count = lsd->lsd_compat14 = cpu_to_le64(mds->mds_mount_count); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index d2a5dcf..1f99bf9 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -2111,20 +2111,46 @@ int lprocfs_obd_rd_hash(char *page, char **start, off_t off, } EXPORT_SYMBOL(lprocfs_obd_rd_hash); -#ifdef CRAY_XT3 -int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off, - int count, int *eof, void *data) +int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%d\n", + obd->obd_recovery_timeout); +} +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft); + +int 
lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft); + +int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off, + int count, int *eof, void *data) { struct obd_device *obd = (struct obd_device *)data; LASSERT(obd != NULL); return snprintf(page, count, "%lu\n", - obd->obd_recovery_max_time); + obd->obd_recovery_time_hard); } -EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime); +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard); -int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, - unsigned long count, void *data) +int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer, + unsigned long count, void *data) { struct obd_device *obd = (struct obd_device *)data; int val, rc; @@ -2134,11 +2160,10 @@ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, if (rc) return rc; - obd->obd_recovery_max_time = val; + obd->obd_recovery_time_hard = val; return count; } -EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime); -#endif /* CRAY_XT3 */ +EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard); #ifdef HAVE_DELAYED_RECOVERY int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off, diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 4d8f3c0..0bc3729 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -78,6 +78,17 @@ static struct lustre_mount_info *server_find_mount(char *name) RETURN(NULL); } +struct lustre_mount_info *server_find_mount_locked(char *name) +{ + struct lustre_mount_info *lmi; + + down(&lustre_mount_info_lock); + lmi = server_find_mount(name); + up(&lustre_mount_info_lock); + + return lmi; +} + /* we must 
register an obd for a mount before we call the setup routine. *_setup will call lustre_get_mount to get the mnt struct by obd_name, since we can't pass the pointer to setup. */ @@ -228,7 +239,6 @@ int server_put_mount(char *name, struct vfsmount *mnt) RETURN(0); } - /******* mount helper utilities *********/ static void ldd_print(struct lustre_disk_data *ldd) @@ -1160,6 +1170,8 @@ struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) } lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; s2lsi_nocast(sb) = lsi; /* we take 1 extra ref for our setup */ atomic_set(&lsi->lsi_mounts, 1); @@ -1717,8 +1729,18 @@ static void lmd_print(struct lustre_mount_data *lmd) PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile); PRINT_CMD(PRINT_MASK, "device: %s\n", lmd->lmd_dev); PRINT_CMD(PRINT_MASK, "flags: %x\n", lmd->lmd_flags); + if (lmd->lmd_opts) PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + for (i = 0; i < lmd->lmd_exclude_count; i++) { PRINT_CMD(PRINT_MASK, "exclude %d: OST%04x\n", i, lmd->lmd_exclude[i]); @@ -1836,6 +1858,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) s1 = options; while (*s1) { int clear = 0; + int time_min = 2 * (CONNECTION_SWITCH_MAX + + 2 * INITIAL_CONNECT_TIMEOUT); + /* Skip whitespace and extra commas */ while (*s1 == ' ' || *s1 == ',') s1++; @@ -1848,6 +1873,14 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) if (strncmp(s1, "abort_recov", 11) == 0) { lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if 
(strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; } else if (strncmp(s1, "nosvc", 5) == 0) { lmd->lmd_flags |= LMD_FLG_NOSVC; clear++; @@ -2073,6 +2106,7 @@ EXPORT_SYMBOL(lustre_register_kill_super_cb); EXPORT_SYMBOL(lustre_common_put_super); EXPORT_SYMBOL(lustre_process_log); EXPORT_SYMBOL(lustre_end_log); +EXPORT_SYMBOL(server_find_mount_locked); EXPORT_SYMBOL(server_get_mount); EXPORT_SYMBOL(server_put_mount); EXPORT_SYMBOL(server_register_target); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 500d220..e3fa98e 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -795,6 +795,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) struct lsd_client_data *lcd = NULL; struct inode *inode = filp->f_dentry->d_inode; unsigned long last_rcvd_size = i_size_read(inode); + struct lustre_mount_info *lmi; __u64 mount_count; __u32 start_epoch; int cl_idx; @@ -1000,16 +1001,28 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) obd->obd_recovering = 1; obd->obd_recovery_start = 0; obd->obd_recovery_end = 0; - obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout; -#ifdef CRAY_XT3 - /* b13079: this should be set to desired value for ost */ - obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME; -#endif } else { LASSERT(!obd->obd_recovering); /* VBR: update boot epoch after recovery */ filter_update_last_epoch(obd); } + + obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT; + obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD; + + lmi = server_find_mount_locked(obd->obd_name); + if (lmi) { + struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb); + + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft) + obd->obd_recovery_timeout = + lsi->lsi_lmd->lmd_recovery_time_soft; + + if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard) + obd->obd_recovery_time_hard = + 
lsi->lsi_lmd->lmd_recovery_time_hard; + } + out: filter->fo_mount_count = mount_count + 1; fsd->lsd_mount_count = cpu_to_le64(filter->fo_mount_count); diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index 3978b09..44cfed1 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -304,12 +304,12 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = { { "tot_dirty", lprocfs_filter_rd_tot_dirty, 0, 0 }, { "tot_pending", lprocfs_filter_rd_tot_pending, 0, 0 }, { "tot_granted", lprocfs_filter_rd_tot_granted, 0, 0 }, - { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, + { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, + { "recovery_time_soft", lprocfs_obd_rd_recovery_time_soft, + lprocfs_obd_wr_recovery_time_soft, 0}, + { "recovery_time_hard", lprocfs_obd_rd_recovery_time_hard, + lprocfs_obd_wr_recovery_time_hard, 0}, { "hash_stats", lprocfs_obd_rd_hash, 0, 0 }, -#ifdef CRAY_XT3 - { "recovery_maxtime", lprocfs_obd_rd_recovery_maxtime, - lprocfs_obd_wr_recovery_maxtime, 0}, -#endif { "evict_client", 0, lprocfs_wr_evict_client, 0, &lprocfs_evict_client_fops}, { "num_exports", lprocfs_rd_num_exports, 0, 0 }, -- 1.8.3.1