Whamcloud - gitweb
LU-10045 obdclass: multiple try when register target 61/30761/7
author Fan Yong <fan.yong@intel.com>
Thu, 11 Jan 2018 15:27:19 +0000 (23:27 +0800)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 31 Jan 2018 05:51:40 +0000 (05:51 +0000)
It is possible that the connection between the MGC and the MGS has not
been established when registering a target with the MGS for a server mount.
At that time, ptlrpcd may be trying to (re-)connect to the MGS
in the background. In such a case, the mount process should not
report failure (-ESHUTDOWN or -EIO); instead, it can retry the
MGS_TARGET_REG RPC after some time (such as 2 seconds).

Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: I44e53a9d1de037907bdb5148b8c44d332439a50c
Reviewed-on: https://review.whamcloud.com/30761
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Faccini Bruno <bruno.faccini@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/obdclass/obd_mount_server.c

index cbe627e..8441d5b 100644 (file)
@@ -1187,6 +1187,7 @@ static int server_register_target(struct lustre_sb_info *lsi)
        struct mgs_target_info *mti = NULL;
        bool writeconf;
        int rc;
+       int tried = 0;
        ENTRY;
 
        LASSERT(mgc);
@@ -1211,6 +1212,7 @@ static int server_register_target(struct lustre_sb_info *lsi)
        writeconf = !!(lsi->lsi_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE));
        mti->mti_flags |= LDD_F_OPC_REG;
 
+again:
        /* Register the target */
        /* FIXME use mgc_process_config instead */
        rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp,
@@ -1224,6 +1226,17 @@ static int server_register_target(struct lustre_sb_info *lsi)
                                "to start: rc = %d. Please see messages on "
                                "the MGS.\n", lsi->lsi_svname, rc);
                } else if (writeconf) {
+                       if ((rc == -ESHUTDOWN || rc == -EIO) && ++tried < 5) {
+                               /* The connection with MGS is not established.
+                                * Try again after 2 seconds. Interruptable. */
+                               set_current_state(TASK_INTERRUPTIBLE);
+                               schedule_timeout(
+                                       msecs_to_jiffies(MSEC_PER_SEC) * 2);
+                               set_current_state(TASK_RUNNING);
+                               if (!signal_pending(current))
+                                       goto again;
+                       }
+
                        LCONSOLE_ERROR_MSG(0x15f,
                                "%s: cannot register this server with the MGS: "
                                "rc = %d. Is the MGS running?\n",