From 274d866f9d09e4ca12e7ac517b856ef9461002a9 Mon Sep 17 00:00:00 2001 From: Andriy Skulysh Date: Mon, 8 Oct 2018 18:43:20 +0300 Subject: [PATCH] LU-11701 ptlrpc: eviction of lwp connection aborts requests An LWP connection isn't replayed, but its ongoing requests should be resent. Modify recovery-small/106 to check for LUSTRE_IMP_RECOVER state instead of LUSTRE_IMP_EVICTED. Change-Id: I50cf85405de588de8499d8fad8a4fe30923f348e Cray-bug-id: LUS-6509 Signed-off-by: Andriy Skulysh Reviewed-by: Alexander Boyko Reviewed-by: Vladimir Saveliev Reviewed-on: https://review.whamcloud.com/33719 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Alexandr Boyko Reviewed-by: Oleg Drokin --- lustre/ptlrpc/import.c | 11 ++++++++++- lustre/target/tgt_handler.c | 3 ++- lustre/tests/recovery-small.sh | 8 ++++---- lustre/tests/replay-single.sh | 34 ++++++++++++++++++++++++++++++++++ lustre/tests/test-framework.sh | 2 +- 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 6bb1666..0ec9bc3 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -1228,7 +1228,16 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_last_replay_transno = 0; imp->imp_replay_cursor = &imp->imp_committed_list; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); - } else { + } else if ((ocd->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0 && + !imp->imp_invalid) { + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + DEBUG_REQ(D_HA, request, "%s: lwp recover", + imp->imp_obd->obd_name); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } else { DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" " not set: %x)", imp->imp_obd->obd_name, msg_flags); imp->imp_remote_handle = diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index 64c0052..6db1270 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ 
-799,7 +799,8 @@ int tgt_request_handle(struct ptlrpc_request *req) LASSERTF(h->th_opc == opc, "opcode mismatch %d != %d\n", h->th_opc, opc); - if (CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE)) + if ((cfs_fail_val == 0 || cfs_fail_val == opc) && + CFS_FAIL_CHECK_ORSET(request_fail_id, CFS_FAIL_ONCE)) GOTO(out, rc = 0); rc = lustre_msg_check_version(msg, h->th_version); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 12ae698..4ae6503 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -1965,17 +1965,17 @@ test_106() { # LU-1789 touch $DIR2/$tfile || error "failed to create empty file" replay_barrier $SINGLEMDS - $LCTL set_param debug=console + $LCTL set_param debug=ha $LCTL clear facet_failover $SINGLEMDS - # lightweight connection must be evicted + # lightweight goes through LUSTRE_IMP_RECOVER during failover touch -c $DIR2/$tfile || true $LCTL dk $TMP/lustre-log-$TESTNAME.log - evicted=`awk '/This client was evicted by .*MDT0000/ { + recovered=`awk '/MDT0000-mdc-[0-9a-f]*: lwp recover/ { print; }' $TMP/lustre-log-$TESTNAME.log` - [ -z "$evicted" ] && error "lightweight client not evicted by mds" + [ -z "$recovered" ] && error "lightweight client was not recovered" # and all operations performed by lightweight client should be # synchronous, so the file created before mds restart should be there diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 232ba70..41909dc 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -4876,6 +4876,40 @@ test_132a() { } run_test 132a "PFL new component instantiate replay" +test_133() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + ([ $FAILURE_MODE == "HARD" ] && + [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) && + skip "MDTs needs to be on diff hosts for HARD fail mode" && + return 0 + + local remote_dir=$DIR/$tdir/remote_dir + + mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed" 
+ $LFS mkdir -i 1 $remote_dir + + umount $MOUNT + do_facet mds2 $LCTL set_param seq.srv*MDT0001.space=clear + + zconf_mount $(hostname) $MOUNT + client_up || return 1 + + #define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 + # SEQ_QUERY = 700 + do_facet mds1 $LCTL set_param fail_val=700 fail_loc=0x80000123 + cp /etc/hosts $remote_dir/file & + local pid=$! + sleep 1 + + fail_nodf mds1 + + wait $pid || error "cp failed" + rm -rf $DIR/$tdir || error "rmdir failed" + + return 0 +} +run_test 133 "check resend of ongoing requests for lwp during failover" + complete $SECONDS check_and_cleanup_lustre exit_status diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 2da5222..00997a7 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -5531,7 +5531,7 @@ at_max_set() { drop_request() { # OBD_FAIL_MDS_ALL_REQUEST_NET RC=0 - do_facet $SINGLEMDS lctl set_param fail_loc=0x123 + do_facet $SINGLEMDS lctl set_param fail_val=0 fail_loc=0x123 do_facet client "$1" || RC=$? do_facet $SINGLEMDS lctl set_param fail_loc=0 return $RC -- 1.8.3.1