Whamcloud - gitweb
LU-17297 grant: move tgt_grant_sanity_check() calls 71/53171/4
author: Vladimir Saveliev <vladimir.saveliev@hpe.com>
Fri, 17 Nov 2023 15:30:06 +0000 (18:30 +0300)
committer: Oleg Drokin <green@whamcloud.com>
Thu, 15 Feb 2024 07:07:33 +0000 (07:07 +0000)
Call tgt_grant_sanity_check() in ofd_obd_disconnect() and in
mdt_obd_disconnect() after the call to tgt_grant_discard().

Otherwise, the sum of grants does not match the total grant counter,
which is reported as a LustreError:
    ofd_obd_disconnect: tot_granted 0 != fo_tot_granted 8388608

This is because, on stale export eviction,
class_disconnect_stale_exports() moves stale exports to a separate
list but does not update the obd's grant counters.

A test illustrating the issue is included.

HPE-bug-id: LUS-11469
Signed-off-by: Vladimir Saveliev <vladimir.saveliev@hpe.com>
Change-Id: I0b4568b88a2fe7b50f4eac50b4b064d7afbc7a75
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53171
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Patrick Farrell <pfarrell@whamcloud.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/include/obd_support.h
lustre/mdt/mdt_handler.c
lustre/ofd/ofd_obd.c
lustre/ptlrpc/client.c
lustre/tests/recovery-small.sh

index 736ed87..574d907 100644 (file)
@@ -478,6 +478,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_PTLRPC_IDLE_RACE       0x533
 #define OBD_FAIL_PTLRPC_ENQ_RESEND      0x534
 #define OBD_FAIL_PTLRPC_DELAY_SEND_FAIL         0x535
+#define OBD_FAIL_PTLRPC_REPLAY_PAUSE    0x536
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 /*     OBD_FAIL_OBD_LOG_CANCEL_NET      0x601 obsolete since 1.5 */
index 3a2a3a2..934b9d8 100644 (file)
@@ -7019,9 +7019,6 @@ static int mdt_obd_disconnect(struct obd_export *exp)
        LASSERT(exp);
        class_export_get(exp);
 
-       if (!(exp->exp_flags & OBD_OPT_FORCE))
-               tgt_grant_sanity_check(exp->exp_obd, __func__);
-
        if (OCD_HAS_FLAG(data, MDS_MDS) && !OCD_HAS_FLAG(data, LIGHTWEIGHT) &&
            atomic_dec_and_test(&mdt->mdt_mds_mds_conns))
                mdt_disable_slc(mdt);
@@ -7037,6 +7034,9 @@ static int mdt_obd_disconnect(struct obd_export *exp)
 
        tgt_grant_discard(exp);
 
+       if (!(exp->exp_flags & OBD_OPT_FORCE))
+               tgt_grant_sanity_check(exp->exp_obd, __func__);
+
        rc = mdt_export_cleanup(exp);
        nodemap_del_member(exp);
        class_export_put(exp);
index f935898..15e5a2b 100644 (file)
@@ -426,13 +426,13 @@ int ofd_obd_disconnect(struct obd_export *exp)
        LASSERT(exp);
        class_export_get(exp);
 
-       if (!(exp->exp_flags & OBD_OPT_FORCE))
-               tgt_grant_sanity_check(ofd_obd(ofd), __func__);
-
        rc = server_disconnect_export(exp);
 
        tgt_grant_discard(exp);
 
+       if (!(exp->exp_flags & OBD_OPT_FORCE))
+               tgt_grant_sanity_check(ofd_obd(ofd), __func__);
+
        /* Do not erase record for recoverable client. */
        if (exp->exp_obd->obd_replayable &&
            (!exp->exp_obd->obd_fail || exp->exp_failed)) {
index c8f75ef..96fde33 100644 (file)
@@ -3367,6 +3367,8 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
 
        LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_REPLAY_PAUSE, cfs_fail_val);
+
        aa = ptlrpc_req_async_args(aa, req);
        memset(aa, 0, sizeof(*aa));
 
index 6e4d17c..736ddd5 100755 (executable)
@@ -3559,6 +3559,50 @@ test_155() {
 }
 run_test 155 "failover after client remount"
 
+test_156()
+{
+       # with timeout=5, the hard recovery time on failover will be 9 * 5 = 45 sec
+       local saved_timeout=$(do_facet ost1 $LCTL get_param -n timeout)
+
+       do_facet mgs $LCTL set_param -P timeout=5 ||
+               error "failed to set obd_timeout"
+       stack_trap "do_facet mgs $LCTL set_param -P timeout=$saved_timeout" \
+           EXIT
+
+       $LFS setstripe -c 1 -i 0 $DIR/$tfile || error "setstripe failed"
+
+       # this is to sync last_rcvd, so that the client will have to
+       # send replay on recovery
+       $LFS df $MOUNT
+       do_facet ost1 sync
+
+       replay_barrier ost1
+
+       $MULTIOP $DIR/$tfile oO_RDWR:O_SYNC:w1048576c || error "multiop failed"
+
+       # delay write replay for 45 sec (OBD_RECOVERY_TIME_HARD) to
+       # get the client evicted as not sending replays
+
+#define OBD_FAIL_PTLRPC_REPLAY_PAUSE    0x536
+       $LCTL set_param fail_loc=0x80000536 fail_val=45
+
+       fail ost1
+
+       # check that ost1 evicted the client in recovery
+       local clients
+       clients=($(do_facet ost1 \
+               $LCTL get_param -n obdfilter.$FSNAME-OST0000.recovery_status |
+               awk '/completed_clients/ { print $2 }' | tr '/' '\n'))
+       [[ $((${clients[0]} + 1)) == ${clients[1]} ]] ||
+               error "client not evicted by ost1"
+
+       local testid=$(echo $TESTNAME | tr '_' ' ')
+       do_facet ost1 dmesg | tac | sed "/$testid/,$ d" |
+               grep "ofd_obd_disconnect: tot_granted" &&
+               error "grant miscount" || true
+}
+run_test 156 "tot_granted miscount after client eviction"
+
 complete_test $SECONDS
 check_and_cleanup_lustre
 exit_status