Whamcloud - gitweb
b=18948
author anserper <anserper>
Tue, 21 Jul 2009 14:03:56 +0000 (14:03 +0000)
committer anserper <anserper>
Tue, 21 Jul 2009 14:03:56 +0000 (14:03 +0000)
i=Nathan Rutman
i=Alexander Zarochentsev

tunable recovery patch

lustre/doc/mount.lustre.8
lustre/include/lprocfs_status.h
lustre/include/lustre_disk.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/mds/mds_fs.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_mount.c
lustre/obdfilter/filter.c
lustre/obdfilter/lproc_obdfilter.c

index 5e67b56..f250ad2 100644 (file)
@@ -107,6 +107,17 @@ Abort client recovery and start the target service immediately.
 .BI md_stripe_cache_size
 Sets the stripe cache size for server side disk with a striped raid
 configuration.
+.TP
+.BI recovery_time_soft= timeout
+Allow 'timeout' seconds for clients to reconnect for recovery after a server
+crash.  This timeout will be incrementally extended if it is about to expire
+and the server is still handling new connections from recoverable clients.
+The default soft recovery timeout is set to 300 seconds (5 minutes).
+.TP
+.BI recovery_time_hard= timeout
+The server will be allowed to incrementally extend its timeout up to a hard
+maximum of 'timeout' seconds.  The default hard recovery timeout is set to
+900 seconds (15 minutes).
 .SH EXAMPLES
 .TP
 .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem
index 7106e91..175ddc1 100644 (file)
@@ -657,15 +657,15 @@ struct file_operations name##_fops = {                                     \
 struct ptlrpc_request;
 extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
 
-#ifdef CRAY_XT3
-/* lprocfs_status.c: read recovery max time bz13079 */
-int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data);
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
 
-/* lprocfs_status.c: write recovery max time bz13079 */
-int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
-                                    unsigned long count, void *data);
-#endif
 #ifdef HAVE_DELAYED_RECOVERY
 int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
                                     int count, int *eof, void *data);
index 19092ea..73ace63 100644 (file)
@@ -156,6 +156,8 @@ struct lustre_mount_data {
         __u32      lmd_flags;         /* lustre mount flags */
         int        lmd_mgs_failnodes; /* mgs failover node count */
         int        lmd_exclude_count;
+        int        lmd_recovery_time_soft;
+        int        lmd_recovery_time_hard;
         char      *lmd_dev;           /* device name */
         char      *lmd_profile;       /* client only */
         char      *lmd_opts;          /* lustre mount options (as opposed to 
@@ -316,6 +318,7 @@ int lustre_process_log(struct super_block *sb, char *logname,
                      struct config_llog_instance *cfg);
 int lustre_end_log(struct super_block *sb, char *logname, 
                        struct config_llog_instance *cfg);
+struct lustre_mount_info *server_find_mount_locked(char *name);
 struct lustre_mount_info *server_get_mount(char *name);
 int server_put_mount(char *name, struct vfsmount *mnt);
 int server_register_target(struct super_block *sb);
index d7279cd..a2bc1fa 100644 (file)
@@ -991,9 +991,7 @@ struct obd_device {
         struct list_head                 obd_delayed_reply_queue;
         time_t                           obd_recovery_start; /* seconds */
         time_t                           obd_recovery_end; /* seconds, for lprocfs_status */
-#ifdef CRAY_XT3
-        time_t                           obd_recovery_max_time; /* seconds, bz13079 */
-#endif
+        time_t                           obd_recovery_time_hard;
         int                              obd_recovery_timeout;
 
         union {
index 6532294..b50e3bb 100644 (file)
@@ -80,12 +80,11 @@ extern unsigned int obd_alloc_fail_rate;
 #else
 #define STALE_EXPORT_MAXTIME_DEFAULT    (0) /**< zero if no delayed recovery */
 #endif
-#ifdef CRAY_XT3
- #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
-#endif
-/* Time to wait for all clients to reconnect during recovery */
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD          (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
 /* Should be very conservative; must catch the first reconnect after reboot */
-#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */
+#define OBD_RECOVERY_TIME_SOFT          (obd_timeout * 3)
 /* Change recovery-small 26b time if you change this */
 #define PING_INTERVAL max(obd_timeout / 4, 1U)
 /* a bit more than maximal journal commit time in seconds */
index 0b305b9..3d0b72e 100644 (file)
@@ -612,7 +612,7 @@ int target_recovery_check_and_stop(struct obd_device *obd)
         obd->obd_version_recov = 1;
         spin_unlock_bh(&obd->obd_processing_task_lock);
         /* reset timer, recovery will proceed with versions now */
-        reset_recovery_timer(obd, OBD_RECOVERY_FACTOR * obd_timeout, 1);
+        reset_recovery_timer(obd, OBD_RECOVERY_TIME_SOFT, 1);
         return 0;
 }
 EXPORT_SYMBOL(target_recovery_check_and_stop);
@@ -1339,15 +1339,11 @@ static void reset_recovery_timer(struct obd_device *obd, int duration,
         else if (!extend && (duration > obd->obd_recovery_timeout))
                 /* Track the client's largest expected replay time */
                 obd->obd_recovery_timeout = duration;
-#ifdef CRAY_XT3
-        /*
-         * If total recovery time already exceed the
-         * obd_recovery_max_time, then CRAY XT3 will
-         * abort the recovery
-         */
-        if(obd->obd_recovery_timeout > obd->obd_recovery_max_time)
-                obd->obd_recovery_timeout = obd->obd_recovery_max_time;
-#endif
+
+        /* Clamp to obd_recovery_time_hard; exceeding it should not happen */
+        if(obd->obd_recovery_timeout > obd->obd_recovery_time_hard)
+                obd->obd_recovery_timeout = obd->obd_recovery_time_hard;
+
         obd->obd_recovery_end = obd->obd_recovery_start +
                                 obd->obd_recovery_timeout;
         if (cfs_time_before(now, obd->obd_recovery_end)) {
@@ -1369,8 +1365,6 @@ static void check_and_start_recovery_timer(struct obd_device *obd,
         }
         CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
-        /* minimum */
-        obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
         obd->obd_recovery_handler = handler;
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
         spin_unlock_bh(&obd->obd_processing_task_lock);
index c69228e..97e6c38 100644 (file)
@@ -463,6 +463,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
         struct mds_obd *mds = &obd->u.mds;
         struct lr_server_data *lsd;
         struct lsd_client_data *lcd = NULL;
+        struct lustre_mount_info *lmi;
         loff_t off = 0;
         unsigned long last_rcvd_size = i_size_read(file->f_dentry->d_inode);
         __u64 mount_count;
@@ -686,16 +687,28 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = 0;
                 obd->obd_recovery_end = 0;
-                obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
-#ifdef CRAY_XT3
-                /* bz13079: this won't be changed for mds */
-                obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
-#endif
         } else {
                 LASSERT(!obd->obd_recovering);
                 /* VBR: update boot epoch after recovery */
                 mds_update_last_epoch(obd);
         }
+
+        obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT;
+        obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD;
+
+        lmi = server_find_mount_locked(obd->obd_name);
+        if (lmi) {
+                struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft)
+                        obd->obd_recovery_timeout =
+                                lsi->lsi_lmd->lmd_recovery_time_soft;
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard)
+                        obd->obd_recovery_time_hard =
+                                lsi->lsi_lmd->lmd_recovery_time_hard;
+        }
+
         mds->mds_mount_count = mount_count + 1;
         lsd->lsd_mount_count = lsd->lsd_compat14 =
                 cpu_to_le64(mds->mds_mount_count);
index d2a5dcf..1f99bf9 100644 (file)
@@ -2111,20 +2111,46 @@ int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
 }
 EXPORT_SYMBOL(lprocfs_obd_rd_hash);
 
-#ifdef CRAY_XT3
-int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data)
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%d\n",
+                        obd->obd_recovery_timeout);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft);
+
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        obd->obd_recovery_timeout = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft);
+
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
 {
         struct obd_device *obd = (struct obd_device *)data;
         LASSERT(obd != NULL);
 
         return snprintf(page, count, "%lu\n",
-                        obd->obd_recovery_max_time);
+                        obd->obd_recovery_time_hard);
 }
-EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime);
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard);
 
-int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
-                                    unsigned long count, void *data)
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
 {
         struct obd_device *obd = (struct obd_device *)data;
         int val, rc;
@@ -2134,11 +2160,10 @@ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
         if (rc)
                 return rc;
 
-        obd->obd_recovery_max_time = val;
+        obd->obd_recovery_time_hard = val;
         return count;
 }
-EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime);
-#endif /* CRAY_XT3 */
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard);
 
 #ifdef HAVE_DELAYED_RECOVERY
 int lprocfs_obd_rd_stale_export_age(char *page, char **start, off_t off,
index 4d8f3c0..0bc3729 100644 (file)
@@ -78,6 +78,17 @@ static struct lustre_mount_info *server_find_mount(char *name)
         RETURN(NULL);
 }
 
+struct lustre_mount_info *server_find_mount_locked(char *name)
+{
+        struct lustre_mount_info *lmi;
+
+        down(&lustre_mount_info_lock);
+        lmi = server_find_mount(name);
+        up(&lustre_mount_info_lock);
+
+        return lmi;
+}
+
 /* we must register an obd for a mount before we call the setup routine.
    *_setup will call lustre_get_mount to get the mnt struct
    by obd_name, since we can't pass the pointer to setup. */
@@ -228,7 +239,6 @@ int server_put_mount(char *name, struct vfsmount *mnt)
         RETURN(0);
 }
 
-
 /******* mount helper utilities *********/
 
 static void ldd_print(struct lustre_disk_data *ldd)
@@ -1160,6 +1170,8 @@ struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
         }
 
         lsi->lsi_lmd->lmd_exclude_count = 0;
+        lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+        lsi->lsi_lmd->lmd_recovery_time_hard = 0;
         s2lsi_nocast(sb) = lsi;
         /* we take 1 extra ref for our setup */
         atomic_set(&lsi->lsi_mounts, 1);
@@ -1717,8 +1729,18 @@ static void lmd_print(struct lustre_mount_data *lmd)
                 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
         PRINT_CMD(PRINT_MASK, "device:  %s\n", lmd->lmd_dev);
         PRINT_CMD(PRINT_MASK, "flags:   %x\n", lmd->lmd_flags);
+
         if (lmd->lmd_opts)
                 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
+
+        if (lmd->lmd_recovery_time_soft)
+                PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
+                          lmd->lmd_recovery_time_soft);
+
+        if (lmd->lmd_recovery_time_hard)
+                PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
+                          lmd->lmd_recovery_time_hard);
+
         for (i = 0; i < lmd->lmd_exclude_count; i++) {
                 PRINT_CMD(PRINT_MASK, "exclude %d:  OST%04x\n", i,
                           lmd->lmd_exclude[i]);
@@ -1836,6 +1858,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
         s1 = options;
         while (*s1) {
                 int clear = 0;
+                int time_min = 2 * (CONNECTION_SWITCH_MAX +
+                               2 * INITIAL_CONNECT_TIMEOUT);
+
                 /* Skip whitespace and extra commas */
                 while (*s1 == ' ' || *s1 == ',')
                         s1++;
@@ -1848,6 +1873,14 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                 if (strncmp(s1, "abort_recov", 11) == 0) {
                         lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
                         clear++;
+                } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+                        lmd->lmd_recovery_time_soft = max_t(int,
+                                simple_strtoul(s1 + 19, NULL, 10), time_min);
+                        clear++;
+                } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+                        lmd->lmd_recovery_time_hard = max_t(int,
+                                simple_strtoul(s1 + 19, NULL, 10), time_min);
+                        clear++;
                 } else if (strncmp(s1, "nosvc", 5) == 0) {
                         lmd->lmd_flags |= LMD_FLG_NOSVC;
                         clear++;
@@ -2073,6 +2106,7 @@ EXPORT_SYMBOL(lustre_register_kill_super_cb);
 EXPORT_SYMBOL(lustre_common_put_super);
 EXPORT_SYMBOL(lustre_process_log);
 EXPORT_SYMBOL(lustre_end_log);
+EXPORT_SYMBOL(server_find_mount_locked);
 EXPORT_SYMBOL(server_get_mount);
 EXPORT_SYMBOL(server_put_mount);
 EXPORT_SYMBOL(server_register_target);
index 500d220..e3fa98e 100644 (file)
@@ -795,6 +795,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
         struct lsd_client_data *lcd = NULL;
         struct inode *inode = filp->f_dentry->d_inode;
         unsigned long last_rcvd_size = i_size_read(inode);
+        struct lustre_mount_info *lmi;
         __u64 mount_count;
         __u32 start_epoch;
         int cl_idx;
@@ -1000,16 +1001,28 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                 obd->obd_recovering = 1;
                 obd->obd_recovery_start = 0;
                 obd->obd_recovery_end = 0;
-                obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
-#ifdef CRAY_XT3
-                /* b13079: this should be set to desired value for ost */
-                obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
-#endif
         } else {
                 LASSERT(!obd->obd_recovering);
                 /* VBR: update boot epoch after recovery */
                 filter_update_last_epoch(obd);
         }
+
+        obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT;
+        obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD;
+
+        lmi = server_find_mount_locked(obd->obd_name);
+        if (lmi) {
+                struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft)
+                        obd->obd_recovery_timeout =
+                                lsi->lsi_lmd->lmd_recovery_time_soft;
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard)
+                        obd->obd_recovery_time_hard =
+                                lsi->lsi_lmd->lmd_recovery_time_hard;
+        }
+
 out:
         filter->fo_mount_count = mount_count + 1;
         fsd->lsd_mount_count = cpu_to_le64(filter->fo_mount_count);
index 3978b09..44cfed1 100644 (file)
@@ -304,12 +304,12 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "tot_dirty",    lprocfs_filter_rd_tot_dirty,   0, 0 },
         { "tot_pending",  lprocfs_filter_rd_tot_pending, 0, 0 },
         { "tot_granted",  lprocfs_filter_rd_tot_granted, 0, 0 },
-        { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "recovery_status",    lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "recovery_time_soft", lprocfs_obd_rd_recovery_time_soft,
+                                lprocfs_obd_wr_recovery_time_soft, 0},
+        { "recovery_time_hard", lprocfs_obd_rd_recovery_time_hard,
+                                lprocfs_obd_wr_recovery_time_hard, 0},
         { "hash_stats",   lprocfs_obd_rd_hash,      0, 0 },
-#ifdef CRAY_XT3
-        { "recovery_maxtime", lprocfs_obd_rd_recovery_maxtime,
-                              lprocfs_obd_wr_recovery_maxtime, 0},
-#endif
         { "evict_client", 0, lprocfs_wr_evict_client, 0,
                                 &lprocfs_evict_client_fops},
         { "num_exports",  lprocfs_rd_num_exports,   0, 0 },