Whamcloud - gitweb
LU-11056 lwp: fix lwp reconnection issue 36/32536/6
author: Hongchao Zhang <hongchao.zhang@intel.com>
Thu, 24 May 2018 20:09:27 +0000 (16:09 -0400)
committer: Oleg Drokin <green@whamcloud.com>
Tue, 28 Aug 2018 05:14:01 +0000 (05:14 +0000)
After the OST or MDT is restarted, the lwp reconnection can fail
with -EALREADY because the connect count in the connection
request is less than the value saved in the corresponding export
at MDT0000, which could cause the system to hang.

The patch also changes lustre_lwp_connect() to use the
OBD_CONNECT_MDS_MDS flag only when the connection is between MDTs.

Change-Id: I9ae7b4faadc65fdaa78458a06315b1739d144feb
Signed-off-by: Hongchao Zhang <hongchao.zhang@intel.com>
Reviewed-on: https://review.whamcloud.com/32536
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ldlm/ldlm_lib.c
lustre/obdclass/obd_mount_server.c

index 3a9c273..6a5daf7 100644 (file)
@@ -1161,6 +1161,7 @@ int target_handle_connect(struct ptlrpc_request *req)
                         * cause namespace inconsistency */
                        spin_lock(&export->exp_lock);
                        export->exp_connecting = 1;
+                       export->exp_conn_cnt = 0;
                        spin_unlock(&export->exp_lock);
                        conn.cookie = export->exp_handle.h_cookie;
                        rc = EALREADY;
@@ -1202,18 +1203,19 @@ no_export:
                               target->obd_name, cluuid.uuid,
                               libcfs_nid2str(req->rq_peer.nid),
                              atomic_read(&export->exp_refcount));
-                GOTO(out, rc = -EBUSY);
-        } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
-                if (!strstr(cluuid.uuid, "mdt"))
-                        LCONSOLE_WARN("%s: Rejecting reconnect from the "
-                                      "known client %s (at %s) because it "
-                                      "is indicating it is a new client",
-                                      target->obd_name, cluuid.uuid,
-                                      libcfs_nid2str(req->rq_peer.nid));
-                GOTO(out, rc = -EALREADY);
-        } else {
-                OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout);
-        }
+                       GOTO(out, rc = -EBUSY);
+       } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 &&
+                  rc != EALREADY) {
+               if (!strstr(cluuid.uuid, "mdt"))
+                       LCONSOLE_WARN("%s: Rejecting reconnect from the "
+                                     "known client %s (at %s) because it "
+                                     "is indicating it is a new client",
+                                     target->obd_name, cluuid.uuid,
+                                     libcfs_nid2str(req->rq_peer.nid));
+               GOTO(out, rc = -EALREADY);
+       } else {
+               OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout);
+       }
 
         if (rc < 0) {
                 GOTO(out, rc);
index a4f0e4b..18b3e17 100644 (file)
@@ -556,7 +556,7 @@ again:
 }
 EXPORT_SYMBOL(lustre_notify_lwp_list);
 
-static int lustre_lwp_connect(struct obd_device *lwp)
+static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt)
 {
        struct lu_env            env;
        struct lu_context        session_ctx;
@@ -582,11 +582,14 @@ static int lustre_lwp_connect(struct obd_device *lwp)
 
        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX;
        data->ocd_version = LUSTRE_VERSION_CODE;
-       data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID |
-               OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE |
-               OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE |
-               OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LFSCK |
-               OBD_CONNECT_BULK_MBITS;
+       data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT |
+               OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 |
+               OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT |
+               OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS;
+
+       if (is_mdt)
+               data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS;
+
        OBD_ALLOC_PTR(uuid);
        if (uuid == NULL)
                GOTO(out, rc = -ENOMEM);
@@ -670,7 +673,7 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi,
        obd = class_name2obd(lwpname);
        LASSERT(obd != NULL);
 
-       rc = lustre_lwp_connect(obd);
+       rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL);
        if (rc == 0) {
                obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE;
                spin_lock(&lsi->lsi_lwp_lock);