Whamcloud - gitweb
b=18948
author anserper <anserper>
Thu, 25 Jun 2009 07:06:46 +0000 (07:06 +0000)
committer anserper <anserper>
Thu, 25 Jun 2009 07:06:46 +0000 (07:06 +0000)
i=Nathan Rutman
i=Andreas Dilger

Tunable recovery timeouts (v1)

lustre/doc/mount.lustre.8
lustre/include/lprocfs_status.h
lustre/include/lustre_disk.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/mds/mds_fs.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_mount.c
lustre/obdfilter/filter.c
lustre/obdfilter/lproc_obdfilter.c

index 5e67b56..f250ad2 100644 (file)
@@ -107,6 +107,17 @@ Abort client recovery and start the target service immediately.
 .BI md_stripe_cache_size
 Sets the stripe cache size for server side disk with a striped raid
 configuration.
+.TP
+.BI recovery_time_soft= timeout
+Allow 'timeout' seconds for clients to reconnect for recovery after a server
+crash.  This timeout will be incrementally extended if it is about to expire
+and the server is still handling new connections from recoverable clients.
+The default soft recovery timeout is set to 300 seconds (5 minutes).
+.TP
+.BI recovery_time_hard= timeout
+The server will be allowed to incrementally extend its timeout up to a hard
+maximum of 'timeout' seconds.  The default hard recovery timeout is set to
+900 seconds (15 minutes).
 .SH EXAMPLES
 .TP
 .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem
index 7106e91..175ddc1 100644 (file)
@@ -657,15 +657,15 @@ struct file_operations name##_fops = {                                     \
 struct ptlrpc_request;
 extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
 
-#ifdef CRAY_XT3
-/* lprocfs_status.c: read recovery max time bz13079 */
-int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data);
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
 
-/* lprocfs_status.c: write recovery max time bz13079 */
-int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
-                                    unsigned long count, void *data);
-#endif
 #ifdef HAVE_DELAYED_RECOVERY
 int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
                                     int count, int *eof, void *data);
index 1e55a80..dc4c24b 100644 (file)
@@ -156,6 +156,8 @@ struct lustre_mount_data {
         __u32      lmd_flags;         /* lustre mount flags */
         int        lmd_mgs_failnodes; /* mgs failover node count */
         int        lmd_exclude_count;
+        int        lmd_recovery_time_soft;
+        int        lmd_recovery_time_hard;
         char      *lmd_dev;           /* device name */
         char      *lmd_profile;       /* client only */
         char      *lmd_opts;          /* lustre mount options (as opposed to 
index 928c5f8..1cfb0c4 100644 (file)
@@ -990,9 +990,7 @@ struct obd_device {
         struct list_head                 obd_delayed_reply_queue;
         time_t                           obd_recovery_start; /* seconds */
         time_t                           obd_recovery_end; /* seconds, for lprocfs_status */
-#ifdef CRAY_XT3
-        time_t                           obd_recovery_max_time; /* seconds, bz13079 */
-#endif
+        time_t                           obd_recovery_time_hard;
         int                              obd_recovery_timeout;
 
         union {
index 6532294..b50e3bb 100644 (file)
@@ -80,12 +80,11 @@ extern unsigned int obd_alloc_fail_rate;
 #else
 #define STALE_EXPORT_MAXTIME_DEFAULT    (0) /**< zero if no delayed recovery */
 #endif
-#ifdef CRAY_XT3
- #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
-#endif
-/* Time to wait for all clients to reconnect during recovery */
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD          (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
 /* Should be very conservative; must catch the first reconnect after reboot */
-#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */
+#define OBD_RECOVERY_TIME_SOFT          (obd_timeout * 3)
 /* Change recovery-small 26b time if you change this */
 #define PING_INTERVAL max(obd_timeout / 4, 1U)
 /* a bit more than maximal journal commit time in seconds */
index d18f8bc..8f6b292 100644 (file)
@@ -614,7 +614,7 @@ int target_recovery_check_and_stop(struct obd_device *obd)
         obd->obd_version_recov = 1;
         spin_unlock_bh(&obd->obd_processing_task_lock);
         /* reset timer, recovery will proceed with versions now */
-        reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1);
+        reset_recovery_timer(obd, OBD_RECOVERY_TIME_SOFT, 1);
         return 0;
 }
 EXPORT_SYMBOL(target_recovery_check_and_stop);
@@ -1335,15 +1335,11 @@ static void reset_recovery_timer(struct obd_device *obd, int duration,
         else if (!extend && (duration > obd->obd_recovery_timeout))
                 /* Track the client's largest expected replay time */
                 obd->obd_recovery_timeout = duration;
-#ifdef CRAY_XT3
-        /*
-         * If total recovery time already exceed the
-         * obd_recovery_max_time, then CRAY XT3 will
-         * abort the recovery
-         */
-        if(obd->obd_recovery_timeout > obd->obd_recovery_max_time)
-                obd->obd_recovery_timeout = obd->obd_recovery_max_time;
-#endif
+
+        /* Clamp to obd_recovery_time_hard; exceeding it should not happen */
+        if(obd->obd_recovery_timeout > obd->obd_recovery_time_hard)
+                obd->obd_recovery_timeout = obd->obd_recovery_time_hard;
+
         obd->obd_recovery_end = obd->obd_recovery_start +
                                 obd->obd_recovery_timeout;
         if (cfs_time_before(now, obd->obd_recovery_end)) {
@@ -1365,8 +1361,6 @@ static void check_and_start_recovery_timer(struct obd_device *obd,
         }
         CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
-        /* minimum */
-        obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
         obd->obd_recovery_handler = handler;
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
index a48a69c..f7abf14 100644 (file)
@@ -463,6 +463,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
         struct mds_obd *mds = &obd->u.mds;
         struct lr_server_data *lsd;
         struct lsd_client_data *lcd = NULL;
+        struct lustre_mount_info *lmi;
         loff_t off = 0;
         unsigned long last_rcvd_size = i_size_read(file->f_dentry->d_inode);
         __u64 mount_count;
@@ -676,16 +677,30 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = 0;
                 obd->obd_recovery_end = 0;
-                obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
-#ifdef CRAY_XT3
-                /* bz13079: this won't be changed for mds */
-                obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
-#endif
         } else {
                 LASSERT(!obd->obd_recovering);
                 /* VBR: update boot epoch after recovery */
                 mds_update_last_epoch(obd);
         }
+
+        obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT;
+        obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD;
+
+        lmi = server_get_mount(obd->obd_name);
+        if (lmi) {
+                struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft)
+                        obd->obd_recovery_timeout =
+                                lsi->lsi_lmd->lmd_recovery_time_soft;
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard)
+                        obd->obd_recovery_time_hard =
+                                lsi->lsi_lmd->lmd_recovery_time_hard;
+
+                server_put_mount(obd->obd_name, lmi->lmi_mnt);
+        }
+
         mds->mds_mount_count = mount_count + 1;
         lsd->lsd_mount_count = lsd->lsd_compat14 =
                 cpu_to_le64(mds->mds_mount_count);
index 17afa31..b10eb85 100644 (file)
@@ -2111,20 +2111,46 @@ int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
 }
 EXPORT_SYMBOL(lprocfs_obd_rd_hash);
 
-#ifdef CRAY_XT3
-int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data)
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%d\n",
+                        obd->obd_recovery_timeout);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft);
+
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        obd->obd_recovery_timeout = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft);
+
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
 {
         struct obd_device *obd = (struct obd_device *)data;
         LASSERT(obd != NULL);
 
         return snprintf(page, count, "%lu\n",
-                        obd->obd_recovery_max_time);
+                        obd->obd_recovery_time_hard);
 }
-EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime);
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard);
 
-int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
-                                    unsigned long count, void *data)
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
 {
         struct obd_device *obd = (struct obd_device *)data;
         int val, rc;
@@ -2134,11 +2160,10 @@ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
         if (rc)
                 return rc;
 
-        obd->obd_recovery_max_time = val;
+        obd->obd_recovery_time_hard = val;
         return count;
 }
-EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime);
-#endif /* CRAY_XT3 */
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard);
 
 #ifdef HAVE_DELAYED_RECOVERY
 int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
index 4d8f3c0..37e6402 100644 (file)
@@ -1160,6 +1160,8 @@ struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
         }
 
         lsi->lsi_lmd->lmd_exclude_count = 0;
+        lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+        lsi->lsi_lmd->lmd_recovery_time_hard = 0;
         s2lsi_nocast(sb) = lsi;
         /* we take 1 extra ref for our setup */
         atomic_set(&lsi->lsi_mounts, 1);
@@ -1717,8 +1719,18 @@ static void lmd_print(struct lustre_mount_data *lmd)
                 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
         PRINT_CMD(PRINT_MASK, "device:  %s\n", lmd->lmd_dev);
         PRINT_CMD(PRINT_MASK, "flags:   %x\n", lmd->lmd_flags);
+
         if (lmd->lmd_opts)
                 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
+
+        if (lmd->lmd_recovery_time_soft)
+                PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
+                          lmd->lmd_recovery_time_soft);
+
+        if (lmd->lmd_recovery_time_hard)
+                PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
+                          lmd->lmd_recovery_time_hard);
+
         for (i = 0; i < lmd->lmd_exclude_count; i++) {
                 PRINT_CMD(PRINT_MASK, "exclude %d:  OST%04x\n", i,
                           lmd->lmd_exclude[i]);
@@ -1836,6 +1848,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
         s1 = options;
         while (*s1) {
                 int clear = 0;
+                int time_min = 2 * (CONNECTION_SWITCH_MAX +
+                               2 * INITIAL_CONNECT_TIMEOUT);
+
                 /* Skip whitespace and extra commas */
                 while (*s1 == ' ' || *s1 == ',')
                         s1++;
@@ -1848,6 +1863,14 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                 if (strncmp(s1, "abort_recov", 11) == 0) {
                         lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
                         clear++;
+                } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+                        lmd->lmd_recovery_time_soft = max_t(int,
+                                simple_strtoul(s1 + 19, NULL, 10), time_min);
+                        clear++;
+                } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+                        lmd->lmd_recovery_time_hard = max_t(int,
+                                simple_strtoul(s1 + 19, NULL, 10), time_min);
+                        clear++;
                 } else if (strncmp(s1, "nosvc", 5) == 0) {
                         lmd->lmd_flags |= LMD_FLG_NOSVC;
                         clear++;
index a454fc5..b37ca7a 100644 (file)
@@ -795,6 +795,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
         struct lsd_client_data *lcd = NULL;
         struct inode *inode = filp->f_dentry->d_inode;
         unsigned long last_rcvd_size = i_size_read(inode);
+        struct lustre_mount_info *lmi;
         __u64 mount_count;
         __u32 start_epoch;
         int cl_idx;
@@ -1000,16 +1001,30 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = 0;
                 obd->obd_recovery_end = 0;
-                obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
-#ifdef CRAY_XT3
-                /* b13079: this should be set to desired value for ost */
-                obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
-#endif
         } else {
                 LASSERT(!obd->obd_recovering);
                 /* VBR: update boot epoch after recovery */
                 filter_update_last_epoch(obd);
         }
+
+        obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT;
+        obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD;
+
+        lmi = server_get_mount(obd->obd_name);
+        if (lmi) {
+                struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft)
+                        obd->obd_recovery_timeout =
+                                lsi->lsi_lmd->lmd_recovery_time_soft;
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard)
+                        obd->obd_recovery_time_hard =
+                                lsi->lsi_lmd->lmd_recovery_time_hard;
+
+                server_put_mount(obd->obd_name, lmi->lmi_mnt);
+        }
+
 out:
         filter->fo_mount_count = mount_count + 1;
         fsd->lsd_mount_count = cpu_to_le64(filter->fo_mount_count);
index dc2a396..0d19f53 100644 (file)
@@ -280,12 +280,12 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "tot_dirty",    lprocfs_filter_rd_tot_dirty,   0, 0 },
         { "tot_pending",  lprocfs_filter_rd_tot_pending, 0, 0 },
         { "tot_granted",  lprocfs_filter_rd_tot_granted, 0, 0 },
-        { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "recovery_status",    lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "recovery_time_soft", lprocfs_obd_rd_recovery_time_soft,
+                                lprocfs_obd_wr_recovery_time_soft, 0},
+        { "recovery_time_hard", lprocfs_obd_rd_recovery_time_hard,
+                                lprocfs_obd_wr_recovery_time_hard, 0},
         { "hash_stats",   lprocfs_obd_rd_hash,      0, 0 },
-#ifdef CRAY_XT3
-        { "recovery_maxtime", lprocfs_obd_rd_recovery_maxtime,
-                              lprocfs_obd_wr_recovery_maxtime, 0},
-#endif
         { "evict_client", 0, lprocfs_wr_evict_client, 0,
                                 &lprocfs_evict_client_fops},
         { "num_exports",  lprocfs_rd_num_exports,   0, 0 },