Whamcloud - gitweb
LU-11701 ptlrpc: eviction of lwp connection aborts requests 19/33719/6
author Andriy Skulysh <c17819@cray.com>
Mon, 8 Oct 2018 15:43:20 +0000 (18:43 +0300)
committer Oleg Drokin <green@whamcloud.com>
Mon, 11 Feb 2019 03:22:15 +0000 (03:22 +0000)
An LWP (lightweight) connection isn't replayed, but its ongoing requests
should be resent rather than aborted by an eviction.
Modify recovery-small/106 to check for
LUSTRE_IMP_RECOVER state instead of LUSTRE_IMP_EVICTED.

Change-Id: I50cf85405de588de8499d8fad8a4fe30923f348e
Cray-bug-id: LUS-6509
Signed-off-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Alexander Boyko <c17825@cray.com>
Reviewed-by: Vladimir Saveliev <c17830@cray.com>
Reviewed-on: https://review.whamcloud.com/33719
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ptlrpc/import.c
lustre/target/tgt_handler.c
lustre/tests/recovery-small.sh
lustre/tests/replay-single.sh
lustre/tests/test-framework.sh

index 6bb1666..0ec9bc3 100644 (file)
@@ -1228,7 +1228,16 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
                 imp->imp_last_replay_transno = 0;
                imp->imp_replay_cursor = &imp->imp_committed_list;
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
-        } else {
+       } else if ((ocd->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0 &&
+                  !imp->imp_invalid) {
+
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+               DEBUG_REQ(D_HA, request, "%s: lwp recover",
+                         imp->imp_obd->obd_name);
+               imp->imp_remote_handle =
+                       *lustre_msg_get_handle(request->rq_repmsg);
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+       } else {
                 DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags"
                           " not set: %x)", imp->imp_obd->obd_name, msg_flags);
                 imp->imp_remote_handle =
index 64c0052..6db1270 100644 (file)
@@ -799,7 +799,8 @@ int tgt_request_handle(struct ptlrpc_request *req)
        LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n",
                 h->th_opc, opc);
 
-       if (CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE))
+       if ((cfs_fail_val == 0 || cfs_fail_val == opc) &&
+            CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE))
                GOTO(out, rc = 0);
 
        rc = lustre_msg_check_version(msg, h->th_version);
index 12ae698..4ae6503 100755 (executable)
@@ -1965,17 +1965,17 @@ test_106() { # LU-1789
        touch $DIR2/$tfile || error "failed to create empty file"
        replay_barrier $SINGLEMDS
 
-       $LCTL set_param debug=console
+       $LCTL set_param debug=ha
        $LCTL clear
        facet_failover $SINGLEMDS
 
-       # lightweight connection must be evicted
+       # lightweight connection goes through LUSTRE_IMP_RECOVER during failover
        touch -c $DIR2/$tfile || true
        $LCTL dk $TMP/lustre-log-$TESTNAME.log
-       evicted=`awk '/This client was evicted by .*MDT0000/ {
+       recovered=`awk '/MDT0000-mdc-[0-9a-f]*: lwp recover/ {
                                      print;
                      }' $TMP/lustre-log-$TESTNAME.log`
-       [ -z "$evicted" ] && error "lightweight client not evicted by mds"
+       [ -z "$recovered" ] && error "lightweight client was not recovered"
 
        # and all operations performed by lightweight client should be
        # synchronous, so the file created before mds restart should be there
index 232ba70..41909dc 100755 (executable)
@@ -4876,6 +4876,40 @@ test_132a() {
 }
 run_test 132a "PFL new component instantiate replay"
 
+test_133() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+       ([ $FAILURE_MODE == "HARD" ] &&
+               [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+               skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+               return 0
+
+       local remote_dir=$DIR/$tdir/remote_dir
+
+       mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+       $LFS mkdir -i 1 $remote_dir
+
+       umount $MOUNT
+       do_facet mds2 $LCTL set_param seq.srv*MDT0001.space=clear
+
+       zconf_mount $(hostname) $MOUNT
+       client_up || return 1
+
+       #define OBD_FAIL_MDS_ALL_REQUEST_NET     0x123
+       # SEQ_QUERY                       = 700
+       do_facet mds1 $LCTL set_param fail_val=700 fail_loc=0x80000123
+       cp /etc/hosts $remote_dir/file &
+       local pid=$!
+       sleep 1
+
+       fail_nodf mds1
+
+       wait $pid || error "cp failed"
+       rm -rf $DIR/$tdir || error "rmdir failed"
+
+       return 0
+}
+run_test 133 "check resend of ongoing requests for lwp during failover"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
index 2da5222..00997a7 100755 (executable)
@@ -5531,7 +5531,7 @@ at_max_set() {
 drop_request() {
 # OBD_FAIL_MDS_ALL_REQUEST_NET
     RC=0
-    do_facet $SINGLEMDS lctl set_param fail_loc=0x123
+    do_facet $SINGLEMDS lctl set_param fail_val=0 fail_loc=0x123
     do_facet client "$1" || RC=$?
     do_facet $SINGLEMDS lctl set_param fail_loc=0
     return $RC