X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-small.sh;h=6ce8a57330bf5c1310e096f244a2d07763cb956c;hb=814bb394843434883a94fe6432cd8c656035a3e1;hp=231ad43bd9b46621be6c09188edfed5cd11e0401;hpb=3341c8c31871ad5bcea914260643bf164194ee9a;p=fs%2Flustre-release.git diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 231ad43..6ce8a57 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2,25 +2,23 @@ set -e -# bug 5493 LU2034 -ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT" - -export MULTIOP=${MULTIOP:-multiop} PTLDEBUG=${PTLDEBUG:--1} -LUSTRE=${LUSTRE:-`dirname $0`/..} +LUSTRE=${LUSTRE:-$(dirname $0)/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ -. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -require_dsh_mds || exit 0 - -# also long tests: 19, 21a, 21e, 21f, 23, 27 - -[ "$SLOW" = "no" ] && EXCEPT_SLOW="" +ALWAYS_EXCEPT="$RECOVERY_SMALL_EXCEPT " +if $SHARED_KEY; then + # bug number for skipped test: LU-12896 + ALWAYS_EXCEPT+=" 110k" + # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +fi build_test_filter +require_dsh_mds || exit 0 + # Allow us to override the setup if we already have a mounted system by # setting SETUP=" " and CLEANUP=" " SETUP=${SETUP:-""} @@ -171,8 +169,8 @@ test_10b() { local before=$(date +%s) local evict - [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.53) ]] && - skip "Need MDS version at least 2.6.53" && return + [[ "$MDS1_VERSION" -lt $(version_code 2.6.53) ]] && + skip "Need MDS version at least 2.6.53" do_facet client "stat $DIR > /dev/null" || error "failed to stat $DIR: $?" drop_bl_callback_once "chmod 0777 $DIR" || @@ -243,8 +241,8 @@ test_10d() { local before=$(date +%s) local evict - [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.90) ]] && - skip "Need MDS version at least 2.6.90" && return + [[ "$MDS1_VERSION" -lt $(version_code 2.6.90) ]] && + skip "Need MDS version at least 2.6.90" # sleep 1 is to make sure that BEFORE is not equal to EVICTED below sleep 1 @@ -284,11 +282,11 @@ run_test 10d "test failed blocking ast" test_10e() { - [[ $(lustre_version_code ost1) -le $(version_code 2.8.58) ]] && - skip "Need OST version at least 2.8.59" && return 0 - [ $CLIENTCOUNT -lt 2 ] && skip "need two clients" && return 0 + [[ "$OST1_VERSION" -le $(version_code 2.8.58) ]] && + skip "Need OST version at least 2.8.59" + [ $CLIENTCOUNT -lt 2 ] && skip "need two clients" [ $(facet_host client) == $(facet_host ost1) ] && - skip "need ost1 and client on different nodes" && return 0 + skip "need ost1 and client on different nodes" local -a clients=(${CLIENTS//,/ }) local client1=${clients[0]} local client2=${clients[1]} @@ -1231,7 +1229,7 @@ test_51() { for i in $SEQ do #echo failover in $i sec - log "test_$testnum: failover in $i sec" + log "$TESTNAME: failover in $i sec" sleep $i facet_failover $SINGLEMDS done @@ -1294,15 +1292,16 @@ test_53() { run_test 53 "touch: drop rep" test_54() { - zconf_mount `hostname` $MOUNT2 - touch $DIR/$tfile - touch $DIR2/$tfile.1 - sleep 10 - cat $DIR2/$tfile.missing # save transno = 0, rc != 0 into last_rcvd - fail $SINGLEMDS - umount $MOUNT2 - ERROR=`dmesg | egrep "(test 54|went back in time)" | tail -n1 | grep "went back in time"` - [ x"$ERROR" == x ] || error "back in time occured" + zconf_mount $(hostname) $MOUNT2 + touch $DIR/$tfile + touch $DIR2/$tfile.1 + sleep 10 + cat $DIR2/$tfile.missing # save transno = 0, rc != 0 into last_rcvd + fail $SINGLEMDS + umount $MOUNT2 + ERROR=$(dmesg | egrep "(test 54|went back in time)" | tail -n1 | + grep "went back in time") + [ x"$ERROR" == x ] || error "back in time occured" } run_test 54 "back in time" @@ -1322,7 +1321,7 @@ test_55() { # Minimum pass speed is 2MBps local ddtimeout=64 # LU-2887/LU-3089 - set min pass speed to 500KBps - [ "$(facet_fstype ost1)" = "zfs" ] && ddtimeout=256 + [ "$ost1_FSTYPE" = zfs ] && ddtimeout=256 # first dd should be finished quickly $LFS setstripe -c 1 -i 0 $DIR/$tdir/$tfile-1 @@ -1531,8 +1530,8 @@ test_65() { mount_client $DIR2 #grant lock1, export2 - $SETSTRIPE -i -0 $DIR2/$tfile || return 1 - $MULTIOP $DIR2/$tfile Ow || return 2 + $LFS setstripe -i -0 $DIR2/$tfile || error "setstripe failed" + $MULTIOP $DIR2/$tfile Ow || error "multiop failed" #define OBD_FAIL_LDLM_BL_EVICT 0x31e do_facet ost $LCTL set_param fail_loc=0x31e @@ -1562,8 +1561,8 @@ run_test 65 "lock enqueue for destroyed export" test_66() { - [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.51) ]] || - { skip "Need MDS version at least 2.7.51"; return 0; } + [[ "$MDS1_VERSION" -ge $(version_code 2.7.51) ]] || + skip "Need MDS version at least 2.7.51" local list=$(comma_list $(osts_nodes)) @@ -1580,7 +1579,8 @@ test_66() do_nodes $list $LCTL set_param fail_loc=0x80000136 #initiate the re-connect & re-send - local mdccli=$($LCTL dl | awk '/-MDT0000-mdc-/ {print $4;}') + local mdtname="MDT0000" + local mdccli=$($LCTL dl | grep "${mdtname}-mdc" | awk '{print $4;}') local conn_uuid=$($LCTL get_param -n mdc.${mdccli}.conn_uuid) $LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}" sleep 2 @@ -1928,7 +1928,7 @@ test_105() # Since the client just mounted, its last_rcvd entry is not on disk. # Send an RPC so exp_need_sync forces last_rcvd to commit this export # so the client can reconnect during OST recovery (LU-924, LU-1582) - $SETSTRIPE -i 0 $DIR/$tfile + $LFS setstripe -i 0 $DIR/$tfile dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 conv=sync # make sure MGS's state is Partial @@ -1964,8 +1964,8 @@ cleanup_106() { } test_106() { # LU-1789 - [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.3.50) ]] || - { skip "Need MDS version at least 2.3.50"; return 0; } + [[ "$MDS1_VERSION" -ge $(version_code 2.3.50) ]] || + skip "Need MDS version at least 2.3.50" #define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 $LCTL set_param fail_loc=0x805 @@ -1991,9 +1991,8 @@ test_106() { # LU-1789 # lightweight goes through LUSTRE_IMP_RECOVER during failover touch -c $DIR2/$tfile || true $LCTL dk $TMP/lustre-log-$TESTNAME.log - recovered=`awk '/MDT0000-mdc-[0-9a-f]*: lwp recover/ { - print; - }' $TMP/lustre-log-$TESTNAME.log` + recovered=$(awk '/MDT0000-mdc-[0-9a-f]*. lwp recover/ { print }' \ + $TMP/lustre-log-$TESTNAME.log) [ -z "$recovered" ] && error "lightweight client was not recovered" # and all operations performed by lightweight client should be @@ -2031,7 +2030,7 @@ run_test 107 "drop reint reply, then restart MDT" test_108() { mkdir -p $DIR/$tdir - $SETSTRIPE -c 1 -i 0 $DIR/$tdir + $LFS setstripe -c 1 -i 0 $DIR/$tdir dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=256 & local dd_pid=$! @@ -2094,7 +2093,7 @@ test_110c () { drop_update_reply $mdtidx "$LFS mkdir -i $mdtidx -c2 $remote_dir" || error "lfs mkdir failed" - diridx=$($GETSTRIPE -m $remote_dir) + diridx=$($LFS getstripe -m $remote_dir) [ $diridx -eq $mdtidx ] || error "$diridx != $mdtidx" rm -rf $DIR/$tdir || error "rmdir failed" @@ -2147,10 +2146,10 @@ test_110f () { run_test 110f "remove remote directory: drop slave rep" test_110g () { - [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.11.0) ]] || - { skip "Need MDS version at least 2.11.0"; return 0; } + [[ "$MDS1_VERSION" -ge $(version_code 2.11.0) ]] || + skip "Need MDS version at least 2.11.0" - [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" mkdir -p $DIR/$tdir touch $DIR/$tdir/$tfile @@ -2174,10 +2173,9 @@ test_110g () { run_test 110g "drop reply during migration" test_110h () { - [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 - local server_version=$(lustre_version_code mds1) - [[ $server_version -ge $(version_code 2.7.56) ]] || - { skip "Need MDS version at least 2.7.56"; return 0; } + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" + [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] || + skip "Need MDS version at least 2.7.56" local src_dir=$DIR/$tdir/source_dir local tgt_dir=$DIR/$tdir/target_dir @@ -2202,10 +2200,9 @@ test_110h () { run_test 110h "drop update reply during cross-MDT file rename" test_110i () { - [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 - local server_version=$(lustre_version_code mds1) - [[ $server_version -ge $(version_code 2.7.56) ]] || - { skip "Need MDS version at least 2.7.56"; return 0; } + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" + [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] || + skip "Need MDS version at least 2.7.56" local src_dir=$DIR/$tdir/source_dir local tgt_dir=$DIR/$tdir/target_dir @@ -2233,10 +2230,9 @@ test_110i () { run_test 110i "drop update reply during cross-MDT dir rename" test_110j () { - [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 - local server_version=$(lustre_version_code mds1) - [[ $server_version -ge $(version_code 2.7.56) ]] || - { skip "Need MDS version at least 2.7.56"; return 0; } + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" + [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] || + skip "Need MDS version at least 2.7.56" local remote_dir=$DIR/$tdir/remote_dir local local_dir=$DIR/$tdir/local_dir @@ -2256,19 +2252,43 @@ test_110j () { } run_test 110j "drop update reply during cross-MDT ln" +test_110k() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTS" + [[ "$MDS1_VERSION" -ge $(version_code 2.12.55) ]] || + skip "Need MDS version at least 2.12.55" + + stop mds2 || error "stop mds2 failed" + umount $MOUNT + +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 + do_facet mds2 lctl set_param fail_loc=0x1103 + local OPTS="$MDS_MOUNT_OPTS -o abort_recovery" + start mds2 $(mdsdevname 2) $OPTS || + error "start MDS with abort_recovery should succeed" + do_facet mds2 lctl set_param fail_loc=0 + + # cleanup + stop mds2 || error "cleanup: stop mds2 failed" + start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || + error "cleanup: start mds2 failed" + zconf_mount $(hostname) $MOUNT || error "cleanup: mount failed" + client_up || error "post-failover df failed" +} +run_test 110k "FID_QUERY failed during recovery" + # LU-2844 mdt prepare fail should not cause umount oops test_111 () { - [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.3.62) ]] || - { skip "Need MDS version at least 2.3.62"; return 0; } + [[ "$MDS1_VERSION" -ge $(version_code 2.3.62) ]] || + skip "Need MDS version at least 2.3.62" #define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 do_facet $SINGLEMDS lctl set_param fail_loc=0x151 stop $SINGLEMDS || error "stop MDS failed" - start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) && + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS && error "start MDS should fail" do_facet $SINGLEMDS lctl set_param fail_loc=0 - start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) || + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS || error "start MDS failed" } run_test 111 "mdd setup fail should not cause umount oops" @@ -2339,8 +2359,8 @@ test_115_write() { } test_115a() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b #define OBD_FAIL_PTLRPC_DROP_BULK 0x51a @@ -2349,8 +2369,8 @@ test_115a() { run_test 115a "read: late REQ MDunlink and no bulk" test_115b() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b #define OBD_FAIL_OST_ENOSPC 0x215 @@ -2362,8 +2382,8 @@ test_115b() { run_test 115b "write: late REQ MDunlink and no bulk" test_115c() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f #define OBD_FAIL_PTLRPC_DROP_BULK 0x51a @@ -2372,8 +2392,8 @@ test_115c() { run_test 115c "read: late Reply MDunlink and no bulk" test_115d() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f #define OBD_FAIL_OST_ENOSPC 0x215 @@ -2382,8 +2402,8 @@ test_115d() { run_test 115d "write: late Reply MDunlink and no bulk" test_115e() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 @@ -2392,8 +2412,8 @@ test_115e() { run_test 115e "read: late Bulk MDunlink and no reply" test_115f() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 @@ -2402,8 +2422,8 @@ test_115f() { run_test 115f "read: late REQ MDunlink and no reply" test_115g() { - [ $(lustre_version_code ost1) -lt $(version_code 2.8.50) ] && - skip "need at least 2.8.50 on OST" && return 0 + [ "$OST1_VERSION" -lt $(version_code 2.8.50) ] && + skip "need at least 2.8.50 on OST" #define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c test_115_read 0x8000051c 0 @@ -2576,10 +2596,9 @@ test_130_base() { } test_130a() { - remote_mds_nodsh && skip "remote MDS with nodsh" && return - local server_version=$(lustre_version_code $SINGLEMDS) - [[ $server_version -ge $(version_code 2.7.2) ]] || - { skip "Need server version newer than 2.7.1"; return 0; } + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ "$MDS1_VERSION" -ge $(version_code 2.7.2) ]] || + skip "Need server version newer than 2.7.1" test_130_base @@ -2589,10 +2608,9 @@ test_130a() { run_test 130a "enqueue resend on not existing file" test_130b() { - remote_mds_nodsh && skip "remote MDS with nodsh" && return - local server_version=$(lustre_version_code $SINGLEMDS) - [[ $server_version -ge $(version_code 2.7.2) ]] || - { skip "Need server version newer than 2.7.1"; return 0; } + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ "$MDS1_VERSION" -ge $(version_code 2.7.2) ]] || + skip "Need server version newer than 2.7.1" test_130_base # let the reply to be dropped @@ -2652,7 +2670,7 @@ test_132() { rm -f $DIR/$tfile # get a lock on client so that export would reach the stale list - $SETSTRIPE -i 0 $DIR/$tfile || error "setstripe failed" + $LFS setstripe -i 0 $DIR/$tfile || error "setstripe failed" dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync || error "dd failed" @@ -2684,7 +2702,7 @@ test_131() { rm -f $DIR/$tfile # get a lock on client so that export would reach the stale list - $SETSTRIPE -i 0 $DIR/$tfile || error "setstripe failed" + $LFS setstripe -i 0 $DIR/$tfile || error "setstripe failed" dd if=/dev/zero of=$DIR/$tfile count=1 || error "dd failed" # another IO under the same lock @@ -2759,7 +2777,7 @@ test_134() { run_test 134 "race between failover and search for reply data free slot" test_135() { - [ $MDS1_VERSION -lt $(version_code 2.12.51) ] && + [ "$MDS1_VERSION" -lt $(version_code 2.12.51) ] && skip "Need MDS version at least 2.12.51" mkdir -p $DIR/$tdir @@ -2773,6 +2791,251 @@ test_135() { } run_test 135 "DOM: open/create resend to return size" +test_136() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ "$MDS1_VERSION" -ge $(version_code 2.12.52) ]] || + skip "Need MDS version at least 2.12.52" + + local mdts=$(comma_list $(mdts_nodes)) + local MDT0=$(facet_svc $SINGLEMDS) + + local clog=$(do_facet mds1 $LCTL --device $MDT0 changelog_register -n) + [ -n "$clog" ] || error "changelog_register failed" + cl_mask=$(do_facet mds1 $LCTL get_param \ + mdd.$MDT0.changelog_mask -n) + changelog_chmask "ALL" + + # generate some changelog records to accumulate + test_mkdir -i 0 -c 0 $DIR/$tdir || error "mkdir $tdir failed" + createmany -m $DIR/$tdir/$tfile 10000 || + error "create $DIR/$tdir/$tfile failed" + + local size1=$(do_facet $SINGLEMDS \ + $LCTL get_param -n mdd.$MDT0.changelog_size) + echo "Changelog size $size1" + + #define OBD_FAIL_LLOG_PURGE_DELAY 0x1318 + do_nodes $mdts $LCTL set_param fail_loc=0x1318 fail_val=30 + + # launch changelog_deregister in background on MDS + do_facet mds1 "nohup $LCTL --device $MDT0 changelog_deregister $clog \ + > foo.out 2> foo.err < /dev/null &" + # give time to reach fail_loc + sleep 15 + + # fail_loc will make MDS sleep in the middle of changelog_deregister + # take this opportunity to abruptly kill MDS + FAILURE_MODE_save=$FAILURE_MODE + FAILURE_MODE=HARD + fail mds1 + FAILURE_MODE=$FAILURE_MODE_save + + do_nodes $mdts $LCTL set_param fail_loc=0x0 fail_val=0 + + local size2=$(do_facet $SINGLEMDS \ + $LCTL get_param -n mdd.$MDT0.changelog_size) + echo "Changelog size $size2" + local clog2=$(do_facet $SINGLEMDS "$LCTL get_param -n \ + mdd.$MDT0.changelog_users | grep $clog") + echo "After crash, changelog user $clog2" + + [ -n "$clog2" -o $size2 -lt $size1 ] || + error "changelog record count unchanged" + + do_facet mds1 $LCTL set_param mdd.$MDT0.changelog_mask=\'$cl_mask\' -n +} +run_test 136 "changelog_deregister leaving pending records" + +test_137() { + df $DIR + mkdir -p $DIR/d1 + mkdir -p $DIR/d2 + dd if=/dev/zero of=$DIR/d1/$tfile bs=4096 count=1 + dd if=/dev/zero of=$DIR/d2/$tfile bs=4096 count=1 + cancel_lru_locks osc + + #define OBD_FAIL_PTLRPC_RESEND_RACE 0x525 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000525" + + # RPC1: any reply is to be delayed to disable last_xid logic + ln $DIR/d1/$tfile $DIR/d1/f2 & + sleep 1 + + # RPC2: setattr1 reply is delayed & resent + # original reply comes to client; the resend get asleep + chmod 666 $DIR/d2/$tfile + + # RPC3: setattr2 on the same file; run ahead of RPC2 resend + chmod 777 $DIR/d2/$tfile + + # RPC2 resend wakes up + sleep 5 + [ $(stat -c "%a" $DIR/d2/$tfile) == 777 ] || error "resend got applied" +} +run_test 137 "late resend must be skipped if already applied" + +test_138() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + [[ "$MDS1_VERSION" -ge $(version_code 2.12.59) ]] || + skip "Need server version newer than 2.12.59" + + zconf_umount_clients $CLIENTS $MOUNT + +#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 + #delay a first step of recovey when MDS waiting clients + #and failing to get osp logs + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x724 fail_val=5 + + facet_failover $SINGLEMDS + + #waiting failover and recovery timer + #the valuse is based on target_recovery_overseer() wait_event timeout + sleep 55 + stop $SINGLEMDS || error "stop MDS failed" + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) || + error "start MDS failed" + zconf_mount_clients $CLIENTS $MOUNT +} +run_test 138 "Umount MDT during recovery" + +test_139() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + [ $MDS1_VERSION -lt $(version_code 2.13.50) ] && + skip "Need MDS version at least 2.13.50" + + mdt_dev=$(mdsdevname 1) + + stop $SINGLEMDS || error "stop $SINGLEMDS failed" + +#define OBD_FAIL_OSP_INVALID_LOGID 0x2106 + do_facet $SINGLEMDS $LCTL set_param fail_val=0x68 fail_loc=0x80002106 + start $SINGLEMDS $mdt_dev $MDS_MOUNT_OPTS || error "Fail to start MDT" +} +run_test 139 "corrupted catid won't cause crash" + +test_140a() { + [ $MDS1_VERSION -lt $(version_code 2.12.58) ] && + skip "Need MDS version at least 2.13.50" + + [ "$SHARED_KEY" = true ] && + skip "server local client incompatible with SSK keys installed" + + slr=$(do_facet mds1 \ + $LCTL get_param -n mdt.$FSNAME-MDT0000.local_recovery) + stack_trap "do_facet mds1 $LCTL set_param \ + mdt.*.local_recovery=$slr" EXIT + + # disable recovery for local clients + # so local clients should be marked with no_recovery flag + do_facet mds1 $LCTL set_param mdt.*.local_recovery=0 + mount_mds_client + + local cnt + cnt=$(do_facet mds1 $LCTL get_param "mdt.*.exports.*.export" | + grep export_flags.*no_recovery | wc -l) + echo "$cnt clients with recovery disabled" + umount_mds_client + [ $cnt -eq 0 ] && error "no clients with recovery disabled" + + # enable recovery for local clients + # so no local clients should be marked with no_recovery flag + do_facet mds1 $LCTL set_param mdt.*.local_recovery=1 + mount_mds_client + + cnt=$(do_facet mds1 $LCTL get_param "mdt.*.exports.*.export" | + grep export_flags.*no_recovery | wc -l) + echo "$cnt clients with recovery disabled" + umount_mds_client + [ $cnt -eq 0 ] || error "$cnt clients with recovery disabled" +} +run_test 140a "local mount is flagged properly" + +test_140b() { + [ $MDS1_VERSION -lt $(version_code 2.12.58) ] && + skip "Need MDS version at least 2.13.50" + + [ "$SHARED_KEY" = true ] && + skip "server local client incompatible with SSK keys installed" + + slr=$(do_facet mds1 \ + $LCTL get_param -n mdt.$FSNAME-MDT0000.local_recovery) + stack_trap "do_facet mds1 $LCTL set_param \ + mdt.*.local_recovery=$slr" EXIT + + # disable recovery for local clients + do_facet mds1 $LCTL set_param mdt.*.local_recovery=0 + + mount_mds_client + replay_barrier mds1 + umount_mds_client + fail mds1 + local recovery=$(do_facet mds1 dmesg | + awk -F: '/Recovery over after/ { print $4 }' | + cut -d, -f1 | tail -1) + (( $recovery < $TIMEOUT*2 )) || + error "recovery took too long $recovery > $((TIMEOUT * 2))" +} +run_test 140b "local mount is excluded from recovery" + +test_141() { + local oldc + local newc + + [ $PARALLEL == "yes" ] && skip "skip parallel run" + combined_mgs_mds || skip "needs combined MGS/MDT" + ( local_mode || from_build_tree ) && + skip "cannot run in local mode or from build tree" + + # some get_param have a bug to handle dot in param name + do_rpc_nodes $(facet_active_host $SINGLEMDS) cancel_lru_locks MGC + oldc=$(do_facet $SINGLEMDS $LCTL get_param -n \ + 'ldlm.namespaces.MGC*.lock_count') + fail $SINGLEMDS + do_rpc_nodes $(facet_active_host $SINGLEMDS) cancel_lru_locks MGC + newc=$(do_facet $SINGLEMDS $LCTL get_param -n \ + 'ldlm.namespaces.MGC*.lock_count') + + [ $oldc -eq $newc ] || error "mgc lost locks ($oldc != $newc)" + return 0 +} +run_test 141 "do not lose locks on MGS restart" + +test_142() { + [ $MDS1_VERSION -lt $(version_code 2.11.56) ] && + skip "Need MDS version at least 2.11.56" + + #define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 + do_facet mds1 $LCTL set_param fail_loc=0x165 + $MULTIOP $DIR/$tfile Ouc || error "multiop failed" + + stop mds1 + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS + + wait_update_facet mds1 "pgrep orph_.*-MDD | wc -l" "0" || + error "MDD orphan cleanup thread not quit" +} +run_test 142 "orphan name stub can be cleaned up in startup" + +test_143() { + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.13.00) ] && + skip "Need MDS version at least 2.13.00" + [ $PARALLEL == "yes" ] && skip "skip parallel run" + + local mntpt=$(facet_mntpt $SINGLEMDS) + stop mds1 + mount_fstype $SINGLEMDS || error "mount as fstype $SINGLEMDS failed" + do_facet $SINGLEMDS touch $mntpt/PENDING/$tfile + unmount_fstype $SINGLEMDS + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || error "mds1 start fail" + + wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" + wait_update_facet mds1 "pgrep orph_.*-MDD | wc -l" "0" || + error "MDD orphan cleanup thread not quit" +} +run_test 143 "orphan cleanup thread shouldn't be blocked even delete failed" + complete $SECONDS check_and_cleanup_lustre exit_status