From: Alex Zhuravlev Date: Fri, 17 Feb 2023 08:00:20 +0000 (+0300) Subject: LU-16478 target: disconnected export X-Git-Tag: 2.15.55~51 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=654d5f3fa4df2a0f7275a6da0f050a18881f4f75;p=fs%2Flustre-release.git LU-16478 target: disconnected export eviction can race with a reconnect, and this in turn can lead to a leaked export reference that prevents further umount - mdt_obd_reconnect() grabs a reference via nodemap_add_member(). Call obd_disconnect() if such a case is observed to balance obd_reconnect(). Signed-off-by: Alex Zhuravlev Change-Id: I3fd49429ef40ef391d58e042e091258dcb9add72 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50041 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Sebastien Buisson Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 2f5d507..6343086 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -257,6 +257,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_NO_LL_OPEN 0x171 #define OBD_FAIL_MDS_LL_BLOCK 0x172 #define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 +#define OBD_FAIL_MDS_CONNECT_VS_EVICT 0x174 /* CMD */ #define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index d012eea..850d2d3 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1088,6 +1088,7 @@ int target_handle_connect(struct ptlrpc_request *req) int tmp_exp_old_falloc; #endif struct ptlrpc_connection *pcon = NULL; + bool reconnected = false; ENTRY; @@ -1443,8 +1444,15 @@ dont_check_exports: new_mds_mds_conn = true; } } else { + if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) { + class_export_get(export); + class_fail_export(export); + class_export_put(export); + } rc = obd_reconnect(req->rq_svc_thread->t_env, export, target, &cluuid, data, &client_nid); + if (rc == 0) + reconnected = true; } if (rc) GOTO(out, rc); @@ -1487,6 +1495,15 @@ 
dont_check_exports: if (export->exp_disconnected) { spin_unlock(&export->exp_lock); + if (reconnected) { + /* + * for each connect called disconnect + * should be called to cleanup stuff + */ + class_export_get(export); + obd_disconnect(export); + } + GOTO(out, rc = -ENODEV); } if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index d665969..eed6089 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -1139,6 +1139,12 @@ int tgt_obd_ping(struct tgt_session_info *tsi) if (rc) RETURN(err_serious(rc)); + if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) { + if (strstr(tsi->tsi_exp->exp_obd->obd_name, "MDT0000") && + (exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_MDS_MDS)) + tsi->tsi_pill->rc_req->rq_no_reply = 1; + } + RETURN(rc); } EXPORT_SYMBOL(tgt_obd_ping); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index c5e0ee3..dca043b 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3431,6 +3431,20 @@ test_152() { } run_test 152 "QoS object allocation could be awakened in case of OST failover" +test_153() { +#define OBD_FAIL_MDS_CONNECT_VS_EVICT 0x174 + do_facet mds1 "$LCTL set_param fail_loc=0x174" + # first drop ping reply from MDS and then + # evict on the subsequent reconnect + # (see target_handle_connect) + sleep $((TIMEOUT + 3)) + stop mds1 + do_facet mds1 "$LCTL set_param fail_loc=0" + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || + error "Fail to start $SINGLEMDS" +} +run_test 153 "evict vs reconnect race" + complete $SECONDS check_and_cleanup_lustre exit_status