Whamcloud - gitweb
b=18948 Speedy recovery
author Mikhail Pershin <tappro@sun.com>
Fri, 16 Apr 2010 20:30:30 +0000 (13:30 -0700)
committer Robert Read <robert.read@oracle.com>
Fri, 16 Apr 2010 20:30:30 +0000 (13:30 -0700)
Add hard and soft time limits for server recovery.

i=andrew.perepechko
i=Hongchao.zhang

lustre/doc/mount.lustre.8
lustre/include/lprocfs_status.h
lustre/include/lustre_disk.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/mdt/mdt_handler.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_mount.c
lustre/obdfilter/filter.c
lustre/obdfilter/lproc_obdfilter.c

index 5e67b56..f250ad2 100644 (file)
@@ -107,6 +107,17 @@ Abort client recovery and start the target service immediately.
 .BI md_stripe_cache_size
 Sets the stripe cache size for server side disk with a striped raid
 configuration.
 .BI md_stripe_cache_size
 Sets the stripe cache size for server side disk with a striped raid
 configuration.
+.TP
+.BI recovery_time_soft= timeout
+Allow 'timeout' seconds for clients to reconnect for recovery after a server
+crash.  This timeout will be incrementally extended if it is about to expire
+and the server is still handling new connections from recoverable clients.
+The default soft recovery timeout is set to 300 seconds (5 minutes).
+.TP
+.BI recovery_time_hard= timeout
+The server will be allowed to incrementally extend its timeout up to a hard
+maximum of 'timeout' seconds.  The default hard recovery timeout is set to
+900 seconds (15 minutes).
 .SH EXAMPLES
 .TP
 .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem
 .SH EXAMPLES
 .TP
 .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem
index c525517..7157130 100644 (file)
@@ -650,13 +650,15 @@ struct file_operations name##_fops = {                                     \
 struct ptlrpc_request;
 extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
 
 struct ptlrpc_request;
 extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
 
-/* lprocfs_status.c: read recovery max time bz13079 */
-int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data);
-
-/* lprocfs_status.c: write recovery max time bz13079 */
-int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
-                                    unsigned long count, void *data);
+/* lprocfs_status.c */
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data);
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                      unsigned long count, void *data);
 
 /* all quota proc functions */
 extern int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
 
 /* all quota proc functions */
 extern int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
index 4cbc744..04caa81 100644 (file)
@@ -177,6 +177,8 @@ struct lustre_mount_data {
         __u32      lmd_flags;         /* lustre mount flags */
         int        lmd_mgs_failnodes; /* mgs failover node count */
         int        lmd_exclude_count;
         __u32      lmd_flags;         /* lustre mount flags */
         int        lmd_mgs_failnodes; /* mgs failover node count */
         int        lmd_exclude_count;
+        int        lmd_recovery_time_soft;
+        int        lmd_recovery_time_hard;
         char      *lmd_dev;           /* device name */
         char      *lmd_profile;       /* client only */
         char      *lmd_mgssec;        /* sptlrpc flavor to mgs */
         char      *lmd_dev;           /* device name */
         char      *lmd_profile;       /* client only */
         char      *lmd_mgssec;        /* sptlrpc flavor to mgs */
@@ -464,6 +466,7 @@ void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
 
 
 int lustre_common_put_super(struct super_block *sb);
 
 
 int lustre_common_put_super(struct super_block *sb);
+struct lustre_mount_info *server_find_mount_locked(const char *name);
 struct lustre_mount_info *server_get_mount(const char *name);
 struct lustre_mount_info *server_get_mount_2(const char *name);
 int server_put_mount(const char *name, struct vfsmount *mnt);
 struct lustre_mount_info *server_get_mount(const char *name);
 struct lustre_mount_info *server_get_mount_2(const char *name);
 int server_put_mount(const char *name, struct vfsmount *mnt);
index 81e86dc..9b3c538 100644 (file)
@@ -1077,7 +1077,7 @@ struct obd_device {
         cfs_timer_t                      obd_recovery_timer;
         time_t                           obd_recovery_start; /* seconds */
         time_t                           obd_recovery_end; /* seconds, for lprocfs_status */
         cfs_timer_t                      obd_recovery_timer;
         time_t                           obd_recovery_start; /* seconds */
         time_t                           obd_recovery_end; /* seconds, for lprocfs_status */
-        time_t                           obd_recovery_max_time; /* seconds, bz13079 */
+        time_t                           obd_recovery_time_hard;
         int                              obd_recovery_timeout;
 
         /* new recovery stuff from CMD2 */
         int                              obd_recovery_timeout;
 
         /* new recovery stuff from CMD2 */
index 87798f8..bd83911 100644 (file)
@@ -114,9 +114,11 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_TIMEOUT_DEFAULT             100
 #define LDLM_TIMEOUT_DEFAULT            20
 #define MDS_LDLM_TIMEOUT_DEFAULT        6
 #define OBD_TIMEOUT_DEFAULT             100
 #define LDLM_TIMEOUT_DEFAULT            20
 #define MDS_LDLM_TIMEOUT_DEFAULT        6
-/* Time to wait for all clients to reconnect during recovery */
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD          (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
 /* Should be very conservative; must catch the first reconnect after reboot */
 /* Should be very conservative; must catch the first reconnect after reboot */
-#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */
+#define OBD_RECOVERY_TIME_SOFT          (obd_timeout * 3)
 /* Change recovery-small 26b time if you change this */
 #define PING_INTERVAL max(obd_timeout / 4, 1U)
 /* Client may skip 1 ping; we must wait at least 2.5. But for multiple
 /* Change recovery-small 26b time if you change this */
 #define PING_INTERVAL max(obd_timeout / 4, 1U)
 /* Client may skip 1 ping; we must wait at least 2.5. But for multiple
index 3b4479b..50795ce 100644 (file)
@@ -1328,15 +1328,11 @@ static void reset_recovery_timer(struct obd_device *obd, int duration,
         else if (!extend && (duration > obd->obd_recovery_timeout))
                 /* Track the client's largest expected replay time */
                 obd->obd_recovery_timeout = duration;
         else if (!extend && (duration > obd->obd_recovery_timeout))
                 /* Track the client's largest expected replay time */
                 obd->obd_recovery_timeout = duration;
-#ifdef CRAY_XT3
-        /*
-         * If total recovery time already exceed the
-         * obd_recovery_max_time, then CRAY XT3 will
-         * abort the recovery
-         */
-        if(obd->obd_recovery_timeout > obd->obd_recovery_max_time)
-                obd->obd_recovery_timeout = obd->obd_recovery_max_time;
-#endif
+
+        /* Clamp to obd_recovery_time_hard; exceeding it should not happen */
+        if (obd->obd_recovery_timeout > obd->obd_recovery_time_hard)
+                obd->obd_recovery_timeout = obd->obd_recovery_time_hard;
+
         obd->obd_recovery_end = obd->obd_recovery_start +
                                 obd->obd_recovery_timeout;
         if (!cfs_timer_is_armed(&obd->obd_recovery_timer) ||
         obd->obd_recovery_end = obd->obd_recovery_start +
                                 obd->obd_recovery_timeout;
         if (!cfs_timer_is_armed(&obd->obd_recovery_timer) ||
@@ -1358,8 +1354,6 @@ static void check_and_start_recovery_timer(struct obd_device *obd)
         }
         CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
         }
         CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
-        /* minimum */
-        obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
         cfs_spin_unlock_bh(&obd->obd_processing_task_lock);
 
         reset_recovery_timer(obd, obd->obd_recovery_timeout, 0);
         cfs_spin_unlock_bh(&obd->obd_processing_task_lock);
 
         reset_recovery_timer(obd, obd->obd_recovery_timeout, 0);
@@ -1807,7 +1801,7 @@ static int target_recovery_thread(void *arg)
         delta = (jiffies - delta) / CFS_HZ;
         CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n",
               delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
         delta = (jiffies - delta) / CFS_HZ;
         CDEBUG(D_INFO,"4: recovery completed in %lus - %d/%d reqs/locks\n",
               delta, obd->obd_replayed_requests, obd->obd_replayed_locks);
-        if (delta > obd_timeout * OBD_RECOVERY_FACTOR) {
+        if (delta > OBD_RECOVERY_TIME_SOFT) {
                 CWARN("too long recovery - read logs\n");
                 libcfs_debug_dumplog();
         }
                 CWARN("too long recovery - read logs\n");
                 libcfs_debug_dumplog();
         }
@@ -1898,9 +1892,12 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
         obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
         obd->obd_recovery_start = 0;
         obd->obd_recovery_end = 0;
         obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
         obd->obd_recovery_start = 0;
         obd->obd_recovery_end = 0;
-        obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
-        /* bz13079: this should be set to desired value for ost but not for mds */
-        obd->obd_recovery_max_time = OBD_RECOVERY_MAX_TIME;
+
+        /* both values may already have been set from mount data */
+        if (obd->obd_recovery_timeout == 0)
+                obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT;
+        if (obd->obd_recovery_time_hard == 0)
+                obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD;
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
         target_start_recovery_thread(lut, handler);
 }
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
         target_start_recovery_thread(lut, handler);
 }
index 63856b9..ab757f6 100644 (file)
@@ -4526,6 +4526,14 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
                         CERROR("CMD Operation not allowed in IOP mode\n");
                         GOTO(err_lmi, rc = -EINVAL);
                 }
                         CERROR("CMD Operation not allowed in IOP mode\n");
                         GOTO(err_lmi, rc = -EINVAL);
                 }
+                /* Read recovery timeouts */
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft)
+                        obd->obd_recovery_timeout =
+                                lsi->lsi_lmd->lmd_recovery_time_soft;
+
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard)
+                        obd->obd_recovery_time_hard =
+                                lsi->lsi_lmd->lmd_recovery_time_hard;
         }
 
         cfs_rwlock_init(&m->mdt_sptlrpc_lock);
         }
 
         cfs_rwlock_init(&m->mdt_sptlrpc_lock);
index e4b063e..8dfd79e 100644 (file)
@@ -2237,18 +2237,45 @@ out:
 }
 EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status);
 
 }
 EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status);
 
-int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
-                                    int count, int *eof, void *data)
+int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%d\n",
+                        obd->obd_recovery_timeout);
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft);
+
+int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int val, rc;
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        obd->obd_recovery_timeout = val;
+        return count;
+}
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft);
+
+int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
 {
         struct obd_device *obd = data;
         LASSERT(obd != NULL);
 
 {
         struct obd_device *obd = data;
         LASSERT(obd != NULL);
 
-        return snprintf(page, count, "%lu\n", obd->obd_recovery_max_time);
+        return snprintf(page, count, "%lu\n", obd->obd_recovery_time_hard);
 }
 }
-EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime);
+EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard);
 
 
-int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
-                                    unsigned long count, void *data)
+int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer,
+                                      unsigned long count, void *data)
 {
         struct obd_device *obd = data;
         int val, rc;
 {
         struct obd_device *obd = data;
         int val, rc;
@@ -2258,11 +2285,10 @@ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
         if (rc)
                 return rc;
 
         if (rc)
                 return rc;
 
-        obd->obd_recovery_max_time = val;
+        obd->obd_recovery_time_hard = val;
         return count;
 }
         return count;
 }
-EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime);
-
+EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard);
 
 EXPORT_SYMBOL(lprocfs_register);
 EXPORT_SYMBOL(lprocfs_srch);
 
 EXPORT_SYMBOL(lprocfs_register);
 EXPORT_SYMBOL(lprocfs_srch);
index b8c7e7d..3d211be 100644 (file)
@@ -1199,6 +1199,8 @@ struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
         }
 
         lsi->lsi_lmd->lmd_exclude_count = 0;
         }
 
         lsi->lsi_lmd->lmd_exclude_count = 0;
+        lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+        lsi->lsi_lmd->lmd_recovery_time_hard = 0;
         s2lsi_nocast(sb) = lsi;
         /* we take 1 extra ref for our setup */
         cfs_atomic_set(&lsi->lsi_mounts, 1);
         s2lsi_nocast(sb) = lsi;
         /* we take 1 extra ref for our setup */
         cfs_atomic_set(&lsi->lsi_mounts, 1);
@@ -1748,7 +1750,6 @@ int lustre_common_put_super(struct super_block *sb)
         RETURN(rc);
 }
 
         RETURN(rc);
 }
 
-#if 0
 static void lmd_print(struct lustre_mount_data *lmd)
 {
         int i;
 static void lmd_print(struct lustre_mount_data *lmd)
 {
         int i;
@@ -1758,14 +1759,23 @@ static void lmd_print(struct lustre_mount_data *lmd)
                 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
         PRINT_CMD(PRINT_MASK, "device:  %s\n", lmd->lmd_dev);
         PRINT_CMD(PRINT_MASK, "flags:   %x\n", lmd->lmd_flags);
                 PRINT_CMD(PRINT_MASK, "profile: %s\n", lmd->lmd_profile);
         PRINT_CMD(PRINT_MASK, "device:  %s\n", lmd->lmd_dev);
         PRINT_CMD(PRINT_MASK, "flags:   %x\n", lmd->lmd_flags);
+
         if (lmd->lmd_opts)
                 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
         if (lmd->lmd_opts)
                 PRINT_CMD(PRINT_MASK, "options: %s\n", lmd->lmd_opts);
+
+        if (lmd->lmd_recovery_time_soft)
+                PRINT_CMD(PRINT_MASK, "recovery time soft: %d\n",
+                          lmd->lmd_recovery_time_soft);
+
+        if (lmd->lmd_recovery_time_hard)
+                PRINT_CMD(PRINT_MASK, "recovery time hard: %d\n",
+                          lmd->lmd_recovery_time_hard);
+
         for (i = 0; i < lmd->lmd_exclude_count; i++) {
                 PRINT_CMD(PRINT_MASK, "exclude %d:  OST%04x\n", i,
                           lmd->lmd_exclude[i]);
         }
 }
         for (i = 0; i < lmd->lmd_exclude_count; i++) {
                 PRINT_CMD(PRINT_MASK, "exclude %d:  OST%04x\n", i,
                           lmd->lmd_exclude[i]);
         }
 }
-#endif
 
 /* Is this server on the exclusion list */
 int lustre_check_exclusion(struct super_block *sb, char *svname)
 
 /* Is this server on the exclusion list */
 int lustre_check_exclusion(struct super_block *sb, char *svname)
@@ -1902,6 +1912,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
         s1 = options;
         while (*s1) {
                 int clear = 0;
         s1 = options;
         while (*s1) {
                 int clear = 0;
+                int time_min = 2 * (CONNECTION_SWITCH_MAX +
+                               2 * INITIAL_CONNECT_TIMEOUT);
+
                 /* Skip whitespace and extra commas */
                 while (*s1 == ' ' || *s1 == ',')
                         s1++;
                 /* Skip whitespace and extra commas */
                 while (*s1 == ' ' || *s1 == ',')
                         s1++;
@@ -1914,6 +1927,14 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                 if (strncmp(s1, "abort_recov", 11) == 0) {
                         lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
                         clear++;
                 if (strncmp(s1, "abort_recov", 11) == 0) {
                         lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
                         clear++;
+                } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+                        lmd->lmd_recovery_time_soft = max_t(int,
+                                simple_strtoul(s1 + 19, NULL, 10), time_min);
+                        clear++;
+                } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+                        lmd->lmd_recovery_time_hard = max_t(int,
+                                simple_strtoul(s1 + 19, NULL, 10), time_min);
+                        clear++;
                 } else if (strncmp(s1, "nosvc", 5) == 0) {
                         lmd->lmd_flags |= LMD_FLG_NOSVC;
                         clear++;
                 } else if (strncmp(s1, "nosvc", 5) == 0) {
                         lmd->lmd_flags |= LMD_FLG_NOSVC;
                         clear++;
@@ -1993,6 +2014,7 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd)
                 strcpy(lmd->lmd_opts, options);
         }
 
                 strcpy(lmd->lmd_opts, options);
         }
 
+        lmd_print(lmd);
         lmd->lmd_magic = LMD_MAGIC;
 
         RETURN(rc);
         lmd->lmd_magic = LMD_MAGIC;
 
         RETURN(rc);
index 4169e19..a88bd4a 100644 (file)
@@ -1988,6 +1988,14 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
                 struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
                 mnt = lmi->lmi_mnt;
                 obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
                 struct lustre_sb_info *lsi = s2lsi(lmi->lmi_sb);
                 mnt = lmi->lmi_mnt;
                 obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd));
+
+                /* gets recovery timeouts from mount data */
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_soft)
+                        obd->obd_recovery_timeout =
+                                lsi->lsi_lmd->lmd_recovery_time_soft;
+                if (lsi->lsi_lmd && lsi->lsi_lmd->lmd_recovery_time_hard)
+                        obd->obd_recovery_time_hard =
+                                lsi->lsi_lmd->lmd_recovery_time_hard;
         } else {
                 /* old path - used by lctl */
                 CERROR("Using old MDS mount method\n");
         } else {
                 /* old path - used by lctl */
                 CERROR("Using old MDS mount method\n");
index e49386e..90fec4e 100644 (file)
@@ -382,9 +382,11 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "tot_pending",  lprocfs_filter_rd_tot_pending, 0, 0 },
         { "tot_granted",  lprocfs_filter_rd_tot_granted, 0, 0 },
         { "hash_stats",   lprocfs_obd_rd_hash,      0, 0 },
         { "tot_pending",  lprocfs_filter_rd_tot_pending, 0, 0 },
         { "tot_granted",  lprocfs_filter_rd_tot_granted, 0, 0 },
         { "hash_stats",   lprocfs_obd_rd_hash,      0, 0 },
-        { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 },
-        { "recovery_maxtime", lprocfs_obd_rd_recovery_maxtime,
-                              lprocfs_obd_wr_recovery_maxtime, 0},
+        { "recovery_status",    lprocfs_obd_rd_recovery_status, 0, 0 },
+        { "recovery_time_soft", lprocfs_obd_rd_recovery_time_soft,
+                                lprocfs_obd_wr_recovery_time_soft, 0},
+        { "recovery_time_hard", lprocfs_obd_rd_recovery_time_hard,
+                                lprocfs_obd_wr_recovery_time_hard, 0},
         { "evict_client", 0, lprocfs_wr_evict_client, 0,
                                 &lprocfs_evict_client_fops},
         { "num_exports",  lprocfs_rd_num_exports,   0, 0 },
         { "evict_client", 0, lprocfs_wr_evict_client, 0,
                                 &lprocfs_evict_client_fops},
         { "num_exports",  lprocfs_rd_num_exports,   0, 0 },