From 9df01eee755bbac5bed560f365fab85c1b1164ae Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Fri, 17 Nov 2023 18:30:06 +0300 Subject: [PATCH] LU-17297 grant: move tgt_grant_sanity_check() calls Call tgt_grant_sanity_check() in ofd_obd_disconnect() and in mdt_obd_disconnect() after the call to tgt_grant_discard(). Otherwise, the sum of grants does not match the total grant counter, which is reported as LustreError: ofd_obd_disconnect: tot_granted 0 != fo_tot_granted 8388608 This is because, on stale export eviction, class_disconnect_stale_exports() moves stale exports to a separate list but does not update the obd's grant counters. A test to illustrate the issue is included. HPE-bug-id: LUS-11469 Signed-off-by: Vladimir Saveliev Change-Id: I0b4568b88a2fe7b50f4eac50b4b064d7afbc7a75 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53171 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Patrick Farrell Reviewed-by: Mikhail Pershin Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger --- lustre/include/obd_support.h | 1 + lustre/mdt/mdt_handler.c | 6 +++--- lustre/ofd/ofd_obd.c | 6 +++--- lustre/ptlrpc/client.c | 2 ++ lustre/tests/recovery-small.sh | 44 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 736ed87..574d907 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -478,6 +478,7 @@ extern bool obd_enable_health_write; #define OBD_FAIL_PTLRPC_IDLE_RACE 0x533 #define OBD_FAIL_PTLRPC_ENQ_RESEND 0x534 #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL 0x535 +#define OBD_FAIL_PTLRPC_REPLAY_PAUSE 0x536 #define OBD_FAIL_OBD_PING_NET 0x600 /* OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 obsolete since 1.5 */ diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 3a2a3a2..934b9d8 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -7019,9 +7019,6 @@ static int mdt_obd_disconnect(struct obd_export *exp) 
LASSERT(exp); class_export_get(exp); - if (!(exp->exp_flags & OBD_OPT_FORCE)) - tgt_grant_sanity_check(exp->exp_obd, __func__); - if (OCD_HAS_FLAG(data, MDS_MDS) && !OCD_HAS_FLAG(data, LIGHTWEIGHT) && atomic_dec_and_test(&mdt->mdt_mds_mds_conns)) mdt_disable_slc(mdt); @@ -7037,6 +7034,9 @@ static int mdt_obd_disconnect(struct obd_export *exp) tgt_grant_discard(exp); + if (!(exp->exp_flags & OBD_OPT_FORCE)) + tgt_grant_sanity_check(exp->exp_obd, __func__); + rc = mdt_export_cleanup(exp); nodemap_del_member(exp); class_export_put(exp); diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index f935898..15e5a2b 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -426,13 +426,13 @@ int ofd_obd_disconnect(struct obd_export *exp) LASSERT(exp); class_export_get(exp); - if (!(exp->exp_flags & OBD_OPT_FORCE)) - tgt_grant_sanity_check(ofd_obd(ofd), __func__); - rc = server_disconnect_export(exp); tgt_grant_discard(exp); + if (!(exp->exp_flags & OBD_OPT_FORCE)) + tgt_grant_sanity_check(ofd_obd(ofd), __func__); + /* Do not erase record for recoverable client. 
*/ if (exp->exp_obd->obd_replayable && (!exp->exp_obd->obd_fail || exp->exp_failed)) { diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index c8f75ef..96fde33 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -3367,6 +3367,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_REPLAY_PAUSE, cfs_fail_val); + aa = ptlrpc_req_async_args(aa, req); memset(aa, 0, sizeof(*aa)); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 6e4d17c..736ddd5 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3559,6 +3559,50 @@ test_155() { } run_test 155 "failover after client remount" +test_156() +{ + # on failover recovery time hard will be 9 * 5 + local saved_timeout=$(do_facet ost1 $LCTL get_param -n timeout) + + do_facet mgs $LCTL set_param -P timeout=5 || + error "failed to set obd_timeout" + stack_trap "do_facet mgs $LCTL set_param -P timeout=$saved_timeout" \ + EXIT + + $LFS setstripe -c 1 -i 0 $DIR/$tfile || error "setstripe failed" + + # this is to sync last_rcvd, so that the client will have to + # send replay on recovery + $LFS df $MOUNT + do_facet ost1 sync + + replay_barrier ost1 + + $MULTIOP $DIR/$tfile oO_RDWR:O_SYNC:w1048576c || error "multiop failed" + + # delay write replay for 45 sec (OBD_RECOVERY_TIME_HARD) to + # get the client evicted as not sending replays + +#define OBD_FAIL_PTLRPC_REPLAY_PAUSE 0x536 + $LCTL set_param fail_loc=0x80000536 fail_val=45 + + fail ost1 + + # check that ost1 evicted the client in recovery + local clients + clients=($(do_facet ost1 \ + $LCTL get_param -n obdfilter.$FSNAME-OST0000.recovery_status | + awk '/completed_clients/ { print $2 }' | tr '/' '\n')) + [[ $((${clients[0]} + 1)) == ${clients[1]} ]] || + error "client not evicted by ost1" + + local testid=$(echo $TESTNAME | tr '_' ' ') + do_facet ost1 dmesg | tac | sed "/$testid/,$ d" | + grep 
"ofd_obd_disconnect: tot_granted" && + error "grant miscount" || true +} +run_test 156 "tot_granted miscount after client eviction" + complete_test $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1