From 71f8e5d6506fb36d56d03024f68da23de84d92d3 Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Mon, 11 Sep 2023 22:32:00 +0300 Subject: [PATCH] LU-14708 ptlrpc: skip unnecessary client eviction A server does not synchronously update the last_rcvd file when new clients connect. If the server fails over before the last_rcvd update is committed, a recently connected client may find itself evicted unexpectedly. If a client has not cached any data from a server and has not performed any modifying RPCs to the server, let the client connect as a new one instead of considering itself evicted. Also make ptlrpc_first_transno() take the first entry from imp_replay_list, not imp_committed_list, when the replay list is not empty. A test illustrating the issue is included. Fixes: dcc8b9c00d5 "LU-9679 ptlrpc: list_for_each improvements" Change-Id: I0c2d9c3b67cbc69c3283422f1f581b42f7f13a1a HPE-bug-id: LUS-7141 Signed-off-by: Vladimir Saveliev Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/43834 Tested-by: Maloo Tested-by: jenkins Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger Reviewed-by: Mikhail Pershin --- lustre/include/lustre_import.h | 3 ++- lustre/ptlrpc/client.c | 4 ++++ lustre/ptlrpc/import.c | 16 +++++++++++----- lustre/tests/recovery-small.sh | 21 +++++++++++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 71b9d14..a4df13e 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -328,7 +328,8 @@ struct obd_import { /* grant shrink disabled */ imp_grant_shrink_disabled:1, /* to supress LCONSOLE() at conn.restore */ - imp_was_idle:1; + imp_was_idle:1, + imp_no_cached_data:1; u32 imp_connect_op; u32 imp_idle_timeout; u32 imp_idle_debug; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index d0f058e..57e012c 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1563,6 +1563,10 @@ static int after_reply(struct ptlrpc_request *req) lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); } + if 
(lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_ENQUEUE) + imp->imp_no_cached_data = 0; + if (imp->imp_replayable) { /* if other threads are waiting for ptlrpc_free_committed() * they could continue the work of freeing RPCs. That reduces diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index f327708..1145862 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -640,7 +640,7 @@ static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) return 1; } if (!list_empty(&imp->imp_replay_list)) { - req = list_first_entry(&imp->imp_committed_list, + req = list_first_entry(&imp->imp_replay_list, struct ptlrpc_request, rq_replay_list); *transno = req->rq_transno; if (req->rq_transno == 0) { @@ -1168,6 +1168,8 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_no_cached_data = 1; + /* Initial connects are allowed for clients with non-random * uuids when servers are in recovery. Simply signal the * servers replay is complete and wait in REPLAY_WAIT. @@ -1282,12 +1284,16 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, *lustre_msg_get_handle(request->rq_repmsg); import_set_state(imp, LUSTRE_IMP_RECOVER); } else { - DEBUG_REQ(D_HA, request, - "%s: evicting (reconnect/recover flags not set: %x)", - imp->imp_obd->obd_name, msg_flags); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); - import_set_state(imp, LUSTRE_IMP_EVICTED); + if (!imp->imp_no_cached_data) { + DEBUG_REQ(D_HA, request, + "%s: evicting (reconnect/recover flags not set: %x)", + imp->imp_obd->obd_name, msg_flags); + import_set_state(imp, LUSTRE_IMP_EVICTED); + } else { + ptlrpc_activate_import(imp, true); + } } /* Sanity checks for a reconnected import. 
*/ diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index edfa244..003f549 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3466,6 +3466,27 @@ test_154() { } run_test 154 "corruption update llog can be skipped" +test_155() { + local lsoutput1 + local lsoutput2 + + touch $DIR/$tfile + lsoutput1=$(ls -l $DIR) + + zconf_umount $HOSTNAME $MOUNT || error "umount failed" + # make sure that last_rcvd update is committed + do_facet mds1 sync + zconf_mount $HOSTNAME $MOUNT || error "mount failed" + + replay_barrier_nosync mds1 + + fail_nodf mds1 + + lsoutput2=$(ls -l $DIR) || error "ls failed" + [[ "$lsoutput1" == "$lsoutput2" ]] || error "$lsoutput1 != $lsoutput2" +} +run_test 155 "failover after client remount" + complete_test $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1