From f9f2325146cee63ff1501479f720f9711538122a Mon Sep 17 00:00:00 2001 From: Hongchao Zhang Date: Wed, 26 Dec 2018 12:22:27 -0500 Subject: [PATCH] LU-11056 lwp: fix lwp reconnection issue After the OST or MDT was restarted, the lwp reconnection can fail with -EALREADY because the connect count in the connection request is less than the value saved in the corresponding export at MDT0000, which could cause the system to hang. The patch also changes lustre_lwp_connect to use the OBD_CONNECT_MDS_MDS flag only when the connection is between MDTs. Lustre-change: https://review.whamcloud.com/32536 Lustre-commit: 0814d5077343953115f50982a2e93cebb29bda68 Change-Id: I9ae7b4faadc65fdaa78458a06315b1739d144feb Signed-off-by: Hongchao Zhang Reviewed-by: Mike Pershin Reviewed-by: Andreas Dilger Signed-off-by: Minh Diep Reviewed-on: https://review.whamcloud.com/33977 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lib.c | 26 ++++++++++++++------------ lustre/obdclass/obd_mount_server.c | 17 ++++++++++------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 3836f99..581dcd7 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1164,6 +1164,7 @@ int target_handle_connect(struct ptlrpc_request *req) * cause namespace inconsistency */ spin_lock(&export->exp_lock); export->exp_connecting = 1; + export->exp_conn_cnt = 0; spin_unlock(&export->exp_lock); conn.cookie = export->exp_handle.h_cookie; rc = EALREADY; @@ -1205,18 +1206,19 @@ no_export: target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), atomic_read(&export->exp_refcount)); - GOTO(out, rc = -EBUSY); - } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { - if (!strstr(cluuid.uuid, "mdt")) - LCONSOLE_WARN("%s: Rejecting reconnect from the " - "known client %s (at %s) because it " - "is indicating it is a new client", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid)); - GOTO(out, 
rc = -EALREADY); - } else { - OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); - } + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } if (rc < 0) { GOTO(out, rc); diff --git a/lustre/obdclass/obd_mount_server.c b/lustre/obdclass/obd_mount_server.c index dc2d192..4469feb 100644 --- a/lustre/obdclass/obd_mount_server.c +++ b/lustre/obdclass/obd_mount_server.c @@ -559,7 +559,7 @@ again: } EXPORT_SYMBOL(lustre_notify_lwp_list); -static int lustre_lwp_connect(struct obd_device *lwp) +static int lustre_lwp_connect(struct obd_device *lwp, bool is_mdt) { struct lu_env env; struct lu_context session_ctx; @@ -585,11 +585,14 @@ static int lustre_lwp_connect(struct obd_device *lwp) data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; data->ocd_version = LUSTRE_VERSION_CODE; - data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE | - OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LFSCK | - OBD_CONNECT_BULK_MBITS; + data->ocd_connect_flags |= OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LRU_RESIZE | OBD_CONNECT_FULL20 | + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + + if (is_mdt) + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS; + OBD_ALLOC_PTR(uuid); if (uuid == NULL) GOTO(out, rc = -ENOMEM); @@ -673,7 +676,7 @@ static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi, obd = class_name2obd(lwpname); LASSERT(obd != NULL); - rc = lustre_lwp_connect(obd); + rc = lustre_lwp_connect(obd, 
strstr(lsi->lsi_svname, "-MDT") != NULL); if (rc == 0) { obd->u.cli.cl_max_mds_easize = MAX_MD_SIZE; spin_lock(&lsi->lsi_lwp_lock); -- 1.8.3.1