From 0814d5077343953115f50982a2e93cebb29bda68 Mon Sep 17 00:00:00 2001 From: Hongchao Zhang Date: Thu, 24 May 2018 16:09:27 -0400 Subject: [PATCH] LU-11056 lwp: fix lwp reconnection issue After the OST or MDT was restarted, the lwp reconnection can fail with -EALREADY because the connect count in the connection request is less than the value saved in the corresponding export at MDT0000, which could cause the system to hang. The patch also changes lustre_lwp_connect to use the OBD_CONNECT_MDS_MDS flag only when the connection is between MDTs. Change-Id: I9ae7b4faadc65fdaa78458a06315b1739d144feb Signed-off-by: Hongchao Zhang Reviewed-on: https://review.whamcloud.com/32536 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lib.c | 26 ++++++++++++++------------ lustre/obdclass/obd_mount_server.c | 17 ++++++++++------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 3a9c273..6a5daf7 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1161,6 +1161,7 @@ int target_handle_connect(struct ptlrpc_request *req) * cause namespace inconsistency */ spin_lock(&export->exp_lock); export->exp_connecting = 1; + export->exp_conn_cnt = 0; spin_unlock(&export->exp_lock); conn.cookie = export->exp_handle.h_cookie; rc = EALREADY; @@ -1202,18 +1203,19 @@ no_export: target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), atomic_read(&export->exp_refcount)); - GOTO(out, rc = -EBUSY); - } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { - if (!strstr(cluuid.uuid, "mdt")) - LCONSOLE_WARN("%s: Rejecting reconnect from the " - "known client %s (at %s) because it " - "is indicating it is a new client", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid)); - GOTO(out, rc = -EALREADY); - } else { - OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); - } + GOTO(out, rc = -EBUSY); + } 
else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } if (rc < 0) { GOTO(out, rc); diff --git a/lustre/obdclass/obd_mount_server.c b/lustre/obdclass/obd_mount_server.c index a4f0e4b..18b3e17 100644 --- a/lustre/obdclass/obd_mount_server.c +++ b/lustre/obdclass/obd_mount_server.c @@ -556,7 +556,7 @@ again: } EXPORT_SYMBOL(lustre_notify_lwp_list); -static int lustre_lwp_connect(struct obd_device *lwp) +static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt) { struct lu_env env; struct lu_context session_ctx; @@ -582,11 +582,14 @@ static int lustre_lwp_connect(struct obd_device *lwp) data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; data->ocd_version = LUSTRE_VERSION_CODE; - data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE | - OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LFSCK | - OBD_CONNECT_BULK_MBITS; + data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + + if (is_mdt) + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; + OBD_ALLOC_PTR(uuid); if (uuid == NULL) GOTO(out, rc = -ENOMEM); @@ -670,7 +673,7 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, obd = class_name2obd(lwpname); LASSERT(obd != NULL); - rc = lustre_lwp_connect(obd); + rc = lustre_lwp_connect(obd, strstr(lsi->lsi_svname, "-MDT") != NULL); if (rc == 0) { obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; spin_lock(&lsi->lsi_lwp_lock); 
-- 1.8.3.1