Whamcloud - gitweb
LU-16478 target: disconnected export 41/50041/7
authorAlex Zhuravlev <bzzz@whamcloud.com>
Fri, 17 Feb 2023 08:00:20 +0000 (11:00 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 21 Mar 2023 23:36:14 +0000 (23:36 +0000)
Eviction can race with a reconnect, which in turn can leak an
export reference and prevent a later umount:
mdt_obd_reconnect() grabs a reference via nodemap_add_member().
Call obd_disconnect() when this case is observed, to balance
obd_reconnect().

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I3fd49429ef40ef391d58e042e091258dcb9add72
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50041
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh

index 2f5d507..6343086 100644 (file)
@@ -257,6 +257,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_NO_LL_OPEN                 0x171
 #define OBD_FAIL_MDS_LL_BLOCK           0x172
 #define OBD_FAIL_MDS_LOD_CREATE_PAUSE   0x173
+#define OBD_FAIL_MDS_CONNECT_VS_EVICT   0x174
 
 /* CMD */
 #define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
index d012eea..850d2d3 100644 (file)
@@ -1088,6 +1088,7 @@ int target_handle_connect(struct ptlrpc_request *req)
        int tmp_exp_old_falloc;
 #endif
        struct ptlrpc_connection *pcon = NULL;
+       bool reconnected = false;
 
        ENTRY;
 
@@ -1443,8 +1444,15 @@ dont_check_exports:
                                new_mds_mds_conn = true;
                }
        } else {
+               if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) {
+                       class_export_get(export);
+                       class_fail_export(export);
+                       class_export_put(export);
+               }
                rc = obd_reconnect(req->rq_svc_thread->t_env,
                                   export, target, &cluuid, data, &client_nid);
+               if (rc == 0)
+                       reconnected = true;
        }
        if (rc)
                GOTO(out, rc);
@@ -1487,6 +1495,15 @@ dont_check_exports:
 
        if (export->exp_disconnected) {
                spin_unlock(&export->exp_lock);
+               if (reconnected) {
+                       /*
+                        * for each connect called disconnect
+                        * should be called to cleanup stuff
+                        */
+                       class_export_get(export);
+                       obd_disconnect(export);
+               }
+
                GOTO(out, rc = -ENODEV);
        }
        if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) {
index d665969..eed6089 100644 (file)
@@ -1139,6 +1139,12 @@ int tgt_obd_ping(struct tgt_session_info *tsi)
        if (rc)
                RETURN(err_serious(rc));
 
+       if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_VS_EVICT)) {
+               if (strstr(tsi->tsi_exp->exp_obd->obd_name, "MDT0000") &&
+                   (exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_MDS_MDS))
+                       tsi->tsi_pill->rc_req->rq_no_reply = 1;
+       }
+
        RETURN(rc);
 }
 EXPORT_SYMBOL(tgt_obd_ping);
index c5e0ee3..dca043b 100755 (executable)
@@ -3431,6 +3431,20 @@ test_152() {
 }
 run_test 152 "QoS object allocation could be awakened in case of OST failover"
 
+test_153() {
+#define OBD_FAIL_MDS_CONNECT_VS_EVICT   0x174
+       do_facet mds1 "$LCTL set_param fail_loc=0x174"
+       # with fail_loc set, MDT0000 drops ping replies on MDS-MDS exports
+       # (tgt_obd_ping) and evicts the export right before the subsequent
+       # reconnect is processed (see target_handle_connect)
+       sleep $((TIMEOUT + 3))
+       stop mds1
+       do_facet mds1 "$LCTL set_param fail_loc=0"
+       start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS ||
+               error "Fail to start $SINGLEMDS"
+}
+run_test 153 "evict vs reconnect race"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status