Whamcloud - gitweb
LU-8351 ptlrpc: allow blocking asts to be delayed 65/21065/6
author Vladimir Saveliev <vladimir.saveliev@seagate.com>
Wed, 29 Jun 2016 13:10:24 +0000 (16:10 +0300)
committer Oleg Drokin <oleg.drokin@intel.com>
Sat, 17 Dec 2016 05:36:02 +0000 (05:36 +0000)
ptlrpc_import_delay_req() refuses to delay blocking ASTs when the import
is not yet in the LUSTRE_IMP_FULL state. That leads to the client being
evicted on the assumption that it failed to respond.

Allow delays for blocking asts being resent.

Signed-off-by: Vladimir Saveliev <vladimir.saveliev@seagate.com>
Seagate-bug-id: MRP-3500
Change-Id: I0e5cde9636afd48cc6cb565f586a59bc7ec01810
Reviewed-on: https://review.whamcloud.com/21065
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/ptlrpc/client.c
lustre/ptlrpc/recover.c
lustre/tests/recovery-small.sh

index 1541b2e..8d2e01b 100644 (file)
@@ -1186,7 +1186,7 @@ static int ptlrpc_import_delay_req(struct obd_import *imp,
                if (atomic_read(&imp->imp_inval_count) != 0) {
                         DEBUG_REQ(D_ERROR, req, "invalidate in flight");
                         *status = -EIO;
-                } else if (imp->imp_dlm_fake || req->rq_no_delay) {
+               } else if (req->rq_no_delay) {
                         *status = -EWOULDBLOCK;
                } else if (req->rq_allow_replay &&
                          (imp->imp_state == LUSTRE_IMP_REPLAY ||
index 02f79d0..9e92b66 100644 (file)
@@ -222,6 +222,7 @@ int ptlrpc_resend(struct obd_import *imp)
        }
        spin_unlock(&imp->imp_lock);
 
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2);
        RETURN(0);
 }
 
index 4f7421f..fb07a4a 100755 (executable)
@@ -282,6 +282,45 @@ test_10d() {
 }
 run_test 10d "test failed blocking ast"
 
+test_10e()
+{
+       [[ $(lustre_version_code ost1) -le $(version_code 2.8.58) ]] &&
+               skip "Need OST version at least 2.8.59" && return 0
+       [ $CLIENTCOUNT -lt 2 ] && skip "need two clients" && return 0
+       [ $(facet_host client) == $(facet_host ost1) ] &&
+               skip "need ost1 and client on different nodes" && return 0
+       local -a clients=(${CLIENTS//,/ })
+       local client1=${clients[0]}
+       local client2=${clients[1]}
+
+       $LFS setstripe -c 1 -i 0 $DIR/$tfile-1 $DIR/$tfile-2
+       $MULTIOP $DIR/$tfile-1 Ow1048576c
+
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET                   0x305
+       $LCTL set_param fail_loc=0x80000305
+
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+       do_facet ost1 "$LCTL set_param fail_loc=0x1000030e"
+       # hit OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT twice:
+       # 1. to return ENOTCONN from ldlm_handle_enqueue0
+       # 2. to pause reconnect handling between resend and setting
+       # import to LUSTRE_IMP_FULL state
+       do_facet ost1 "$LCTL set_param fail_val=3"
+
+       # client1 fails to respond to bl ast
+       do_node $client2 "$MULTIOP $DIR/$tfile-1 Ow1048576c" &
+       MULTIPID=$!
+
+       # ost1 returns error on enqueue, which causes client1 to reconnect
+       do_node $client1 "$MULTIOP $DIR/$tfile-2 Ow1048576c" ||
+               error "multiop failed"
+       wait $MULTIPID
+
+       do_facet ost1 "$LCTL set_param fail_loc=0"
+       do_facet ost1 "$LCTL set_param fail_val=0"
+}
+run_test 10e "re-send BL AST vs reconnect race 2"
+
 #bug 2460
 # wake up a thread waiting for completion after eviction
 test_11(){