From bde948c9cf11793d8e7fef59ea2a10bb2d684bf6 Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Wed, 29 Jun 2016 16:10:24 +0300 Subject: [PATCH] LU-8351 ptlrpc: allow blocking asts to be delayed ptlrpc_import_delay_req() refuses to delay blocking asts when import is not in LUSTRE_IMP_FULL yet. That leads to client eviction assuming that it failed to respond. Allow delays for blocking asts being resent. Signed-off-by: Vladimir Saveliev Seagate-bug-id: MRP-3500 Change-Id: I0e5cde9636afd48cc6cb565f586a59bc7ec01810 Reviewed-on: https://review.whamcloud.com/21065 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Bobi Jam Reviewed-by: John L. Hammond Reviewed-by: Oleg Drokin --- lustre/ptlrpc/client.c | 2 +- lustre/ptlrpc/recover.c | 1 + lustre/tests/recovery-small.sh | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 1541b2e..8d2e01b 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1186,7 +1186,7 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, if (atomic_read(&imp->imp_inval_count) != 0) { DEBUG_REQ(D_ERROR, req, "invalidate in flight"); *status = -EIO; - } else if (imp->imp_dlm_fake || req->rq_no_delay) { + } else if (req->rq_no_delay) { *status = -EWOULDBLOCK; } else if (req->rq_allow_replay && (imp->imp_state == LUSTRE_IMP_REPLAY || diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 02f79d0..9e92b66 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -222,6 +222,7 @@ int ptlrpc_resend(struct obd_import *imp) } spin_unlock(&imp->imp_lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2); RETURN(0); } diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 4f7421f..fb07a4a 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -282,6 +282,45 @@ test_10d() { } run_test 10d "test failed blocking ast" +test_10e() +{ + [[ 
$(lustre_version_code ost1) -le $(version_code 2.8.58) ]] && + skip "Need OST version at least 2.8.59" && return 0 + [ $CLIENTCOUNT -lt 2 ] && skip "need two clients" && return 0 + [ $(facet_host client) == $(facet_host ost1) ] && + skip "need ost1 and client on different nodes" && return 0 + local -a clients=(${CLIENTS//,/ }) + local client1=${clients[0]} + local client2=${clients[1]} + + $LFS setstripe -c 1 -i 0 $DIR/$tfile-1 $DIR/$tfile-2 + $MULTIOP $DIR/$tfile-1 Ow1048576c + +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 + $LCTL set_param fail_loc=0x80000305 + +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e + do_facet ost1 "$LCTL set_param fail_loc=0x1000030e" + # hit OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT twice: + # 1. to return ENOTCONN from ldlm_handle_enqueue0 + # 2. to pause reconnect handling between resend and setting + # import to LUSTRE_IMP_FULL state + do_facet ost1 "$LCTL set_param fail_val=3" + + # client1 fails to respond to bl ast + do_node $client2 "$MULTIOP $DIR/$tfile-1 Ow1048576c" & + MULTIPID=$! + + # ost1 returns error on enqueue, which causes client1 to reconnect + do_node $client1 "$MULTIOP $DIR/$tfile-2 Ow1048576c" || + error "multiop failed" + wait $MULTIPID + + do_facet ost1 "$LCTL set_param fail_loc=0" + do_facet ost1 "$LCTL set_param fail_val=0" +} +run_test 10e "re-send BL AST vs reconnect race 2" + #bug 2460 # wake up a thread waiting for completion after eviction test_11(){ -- 1.8.3.1