From: Alex Zhuravlev Date: Fri, 17 Feb 2023 08:00:20 +0000 (+0300) Subject: LU-16478 target: disconnected export X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=6a4687a4998a4e78fcfe5061c409af5459cc9e8a;p=fs%2Flustre-release.git LU-16478 target: disconnected export eviction can race with a reconnect and this in turn can lead to a leaked export reference preventing further umount - mdt_obd_reconnect() grabs a reference via nodemap_add_member(). call obd_disconnect() if such a case is observed to balance obd_reconnect(). Lustre-change: https://review.whamcloud.com/50041 Lustre-commit: 654d5f3fa4df2a0f7275a6da0f050a18881f4f75 Signed-off-by: Alex Zhuravlev Change-Id: I3fd49429ef40ef391d58e042e091258dcb9add72 Reviewed-by: Andreas Dilger Reviewed-by: Sebastien Buisson Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/50427 Tested-by: jenkins Tested-by: Maloo --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index ae0c678..c3ade18 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -255,8 +255,9 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 #define OBD_FAIL_MDS_NO_LL_OPEN 0x171 #define OBD_FAIL_MDS_LL_BLOCK 0x172 -#define OBD_FAIL_MDS_LL_PCCRO 0x173 -#define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x174 +#define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 +#define OBD_FAIL_MDS_CONNECT_VS_EVICT 0x174 +#define OBD_FAIL_MDS_LL_PCCRO 0x17a /* CMD */ #define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 860e7fc..4bb4feb 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1112,6 +1112,7 @@ int target_handle_connect(struct ptlrpc_request *req) int size, tmpsize; lnet_nid_t *client_nid = NULL; struct ptlrpc_connection *pcon = NULL; + bool reconnected = false; ENTRY; @@ -1455,8 +1456,15 @@ dont_check_exports: new_mds_mds_conn = true; } } else { + if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) { + class_export_get(export); + 
class_fail_export(export); + class_export_put(export); + } rc = obd_reconnect(req->rq_svc_thread->t_env, export, target, &cluuid, data, client_nid); + if (rc == 0) + reconnected = true; } if (rc) GOTO(out, rc); @@ -1499,6 +1507,15 @@ dont_check_exports: if (export->exp_disconnected) { spin_unlock(&export->exp_lock); + if (reconnected) { + /* + * for each connect called disconnect + * should be called to cleanup stuff + */ + class_export_get(export); + obd_disconnect(export); + } + GOTO(out, rc = -ENODEV); } if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index eebde9f..9298c3f 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -1120,6 +1120,12 @@ int tgt_obd_ping(struct tgt_session_info *tsi) if (rc) RETURN(err_serious(rc)); + if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) { + if (strstr(tsi->tsi_exp->exp_obd->obd_name, "MDT0000") && + (exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_MDS_MDS)) + tsi->tsi_pill->rc_req->rq_no_reply = 1; + } + RETURN(rc); } EXPORT_SYMBOL(tgt_obd_ping); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 67731b3..94dc756 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3302,6 +3302,20 @@ test_152() { } run_test 152 "QoS object allocation could be awakened in case of OST failover" +test_153() { +#define OBD_FAIL_MDS_CONNECT_VS_EVICT 0x174 + do_facet mds1 "$LCTL set_param fail_loc=0x174" + # first drop ping reply from MDS and then + # evict on the subsequent reconnect + # (see target_handle_connect) + sleep $((TIMEOUT + 3)) + stop mds1 + do_facet mds1 "$LCTL set_param fail_loc=0" + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || + error "Fail to start $SINGLEMDS" +} +run_test 153 "evict vs reconnect race" + complete $SECONDS check_and_cleanup_lustre exit_status