X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-small.sh;h=03ce07c86e26fa7a002e1cc7eb843a6e420351c7;hp=62d20516cd8c49c6967329a581956ccdaded095e;hb=bebf1e3f09144ca59cc6b0c2b61c65290626ada8;hpb=aa6773d95b0212e5bfe2a187e679322c1ac9eb8e

Review note (not part of the original change): in test_10c the import state is
now parsed with the same '/EVICTED ]$/' and $5 awk expression used by test_10a,
test_10b and the updated test_19c/test_113 hunks; the failure message reports
the function's own $evict/$before locals; mdtidx and mdtname are declared
local. These edits leave every hunk's line counts unchanged.

diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 62d2051..03ce07c 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -20,8 +20,8 @@ require_dsh_mds || exit 0

 [ "$SLOW" = "no" ] && EXCEPT_SLOW="17 26a 26b 50 51 57"

 [ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
-# bug number for skipped test: LU-2194 LU-2547
-	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 19b 24a 24b"
+# bug number for skipped test: LU-2547
+	ALWAYS_EXCEPT="$ALWAYS_EXCEPT 24a 24b"

 build_test_filter
@@ -149,21 +149,140 @@ test_9() {
 run_test 9 "pause bulk on OST (bug 1420)"

 #bug 1521
-test_10() {
-	do_facet client mcreate $DIR/$tfile ||
-		{ error "mcreate failed: $?"; return 1; }
-	drop_bl_callback "chmod 0777 $DIR/$tfile" || echo "evicted as expected"
-	# wait for the mds to evict the client
-	#echo "sleep $(($TIMEOUT*2))"
-	#sleep $(($TIMEOUT*2))
-	do_facet client touch $DIR/$tfile || echo "touch failed, evicted"
-	do_facet client checkstat -v -p 0777 $DIR/$tfile ||
-		{ error "client checkstat failed: $?"; return 3; }
-	do_facet client "munlink $DIR/$tfile"
-	# allow recovery to complete
-	client_up || client_up || sleep $TIMEOUT
+test_10a() {
+	local before=$(date +%s)
+	local evict
+
+	do_facet client "stat $DIR > /dev/null" ||
+		error "failed to stat $DIR: $?"
+	drop_bl_callback "chmod 0777 $DIR" ||
+		error "failed to chmod $DIR: $?"
+
+	# let the client reconnect
+	client_reconnect
+	evict=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state |
+		awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')
+	[ ! -z "$evict" ] && [[ $evict -gt $before ]] ||
+		(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state;
+		    error "no eviction: $evict before:$before")
+
+	do_facet client checkstat -v -p 0777 $DIR ||
+		error "client checkstat failed: $?"
+}
+run_test 10a "finish request on server after client eviction (bug 1521)"
+
+test_10b() {
+	local before=$(date +%s)
+	local evict
+
+	[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.53) ]] &&
+		skip "Need MDS version at least 2.6.53" && return
+	do_facet client "stat $DIR > /dev/null" ||
+		error "failed to stat $DIR: $?"
+	drop_bl_callback_once "chmod 0777 $DIR" ||
+		error "failed to chmod $DIR: $?"
+
+	# let the client reconnect
+	client_reconnect
+	evict=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state |
+		awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')
+
+	[ -z "$evict" ] || [[ $evict -le $before ]] ||
+		(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state;
+		    error "eviction happened: $evict before:$before")
+
+	do_facet client checkstat -v -p 0777 $DIR ||
+		error "client checkstat failed: $?"
+}
+run_test 10b "re-send BL AST"
+
+test_10c() {
+	local before=$(date +%s)
+	local evict
+	local mdccli
+	local mdcpath
+	local conn_uuid
+	local workdir
+	local pid
+	local rc
+
+	workdir="${DIR}/${tdir}"
+	mkdir -p ${workdir} || error "can't create workdir $?"
+	stat ${workdir} > /dev/null ||
+		error "failed to stat ${workdir}: $?"
+	local mdtidx=$($LFS getdirstripe -i ${workdir})
+	local mdtname=$($LFS mdts ${workdir} | grep -e "^$mdtidx:" |
+		awk '{sub("_UUID", "", $2); print $2;}')
+	#assume one client
+	mdccli=$($LCTL dl | grep "${mdtname}-mdc" | awk '{print $4;}')
+	conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid)
+	mdcpath="mdc.${mdccli}.import=connection=${conn_uuid}"
+
+	drop_bl_callback_once "chmod 0777 ${workdir}" &
+	pid=$!
+
+	# let chmod block
+	sleep 1
+	# force client reconnect
+	$LCTL set_param "${mdcpath}"
+
+	# wait for the client to reconnect
+	client_reconnect
+	wait $pid
+	rc=$?
+	evict=$($LCTL get_param mdc.${mdccli}.state |
+		awk -F"[ [,]" '/EVICTED ]$/ { if (t<$5) {t=$5;} } END { print t }')
+
+	[[ $evict -le $before ]] ||
+		( $LCTL get_param mdc.$FSNAME-MDT*.state;
+		    error "eviction happened: $evict before:$before" )
+
+	[ $rc -eq 0 ] || error "chmod must finished OK"
+	checkstat -v -p 0777 "${workdir}" ||
+		error "client checkstat failed: $?"
 }
-run_test 10 "finish request on server after client eviction (bug 1521)"
+run_test 10c "re-send BL AST vs reconnect race (LU-5569)"
+
+test_10d() {
+	local before=$(date +%s)
+	local evict
+
+	[[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.90) ]] &&
+		skip "Need MDS version at least 2.6.90" && return
+
+	# sleep 1 is to make sure that BEFORE is not equal to EVICTED below
+	sleep 1
+	rm -f $TMP/$tfile
+	echo -n ", world" | dd of=$TMP/$tfile bs=1c seek=5
+
+	mount_client $MOUNT2
+
+	cancel_lru_locks osc
+	$LFS setstripe -i 0 -c 1 $DIR1/$tfile
+	echo -n hello > $DIR1/$tfile
+
+	stat $DIR2/$tfile >& /dev/null
+	$LCTL set_param fail_err=71
+	drop_bl_callback "echo -n \\\", world\\\" >> $DIR2/$tfile"
+
+	client_reconnect
+
+	cmp $DIR1/$tfile $DIR2/$tfile || error "file contents differ"
+	cmp $DIR1/$tfile $TMP/$tfile || error "wrong content found"
+
+	evict=$(do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state | \
+		tr -d '\-\[\] ' | \
+		awk -F"[ [,]" '/EVICTED$/ { if (mx<$1) {mx=$1;} } END { print mx }')
+
+	[[ $evict -gt $before ]] ||
+		(do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state;
+		    error "no eviction: $evict before:$before")
+
+	$LCTL set_param fail_err=0
+	rm $TMP/$tfile
+	umount_client $MOUNT2
+}
+run_test 10d "test failed blocking ast"

 #bug 2460
 # wake up a thread waiting for completion after eviction
@@ -177,7 +296,8 @@ test_11(){
 	do_facet client $MULTIOP $DIR/$tfile or ||
 		{ error "multiop read failed: $?"; return 3; }

-	drop_bl_callback $MULTIOP $DIR/$tfile Ow || echo "evicted as expected"
+	drop_bl_callback_once $MULTIOP $DIR/$tfile Ow ||
+		echo "evicted as expected"

 	do_facet client munlink $DIR/$tfile ||
 		{ error "munlink failed: $?"; return 4; }
@@ -492,7 +612,7 @@ test_19c() {
 	# let the client reconnect
 	sleep 5
 	EVICT=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state |
-		awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }')
+		awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')

 	[ -z "$EVICT" ] || [[ $EVICT -le $BEFORE ]] || error "eviction happened"
 }
@@ -911,7 +1031,8 @@ run_test 27 "fail LOV while using OSC's"

 test_28() { # bug 6086 - error adding new clients
 	do_facet client mcreate $DIR/$tfile || return 1
-	drop_bl_callback "chmod 0777 $DIR/$tfile" ||echo "evicted as expected"
+	drop_bl_callback_once "chmod 0777 $DIR/$tfile" ||
+		echo "evicted as expected"
 	#define OBD_FAIL_MDS_CLIENT_ADD 0x12f
 	do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012f"
 	# fail once (evicted), reconnect fail (fail_loc), ok
@@ -1163,10 +1284,10 @@ test_58() { # bug 11546
 	pid=$!
 	sleep 1
 	lctl set_param fail_loc=0
-	drop_bl_callback rm -f $DIR/$tfile
+	drop_bl_callback_once rm -f $DIR/$tfile
 	wait $pid
 	# the first 'df' could tigger the eviction caused by
-	# 'drop_bl_callback', and it's normal case.
+	# 'drop_bl_callback_once', and it's normal case.
 	# but the next 'df' should return successfully.
 	do_facet client "df $DIR" || do_facet client "df $DIR"
 }
@@ -1265,12 +1386,14 @@ test_61()

 	replay_barrier $SINGLEMDS
 	createmany -o $DIR/$tdir/$tfile-%d 10
-	local oid=`do_facet ost1 "lctl get_param -n obdfilter.${ost1_svc}.last_id"`
+	local oid=$(do_facet ost1 "lctl get_param -n \
+		obdfilter.${ost1_svc}.last_id" | sed -e 's/.*://')

 	fail_abort $SINGLEMDS
-
+
 	touch $DIR/$tdir/$tfile
-	local id=`$LFS getstripe $DIR/$tdir/$tfile | awk '$1 == 0 { print $2 }'`
+	local id=$($LFS getstripe $DIR/$tdir/$tfile |
+		awk '$1 == 0 { print $2 }')
 	[ $id -le $oid ] && error "the orphan objid was reused, failed"

 	# Cleanup
@@ -1288,6 +1411,72 @@ run_test 61 "Verify to not reuse orphan objects - bug 17025"
 #}
 #run_test 62 "Verify connection flags race - bug LU-1716"

+test_66()
+{
+	[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.51) ]] ||
+		{ skip "Need MDS version at least 2.7.51"; return 0; }
+
+	local list=$(comma_list $(osts_nodes))
+
+	# modify dir so that next revalidate would not obtain UPDATE lock
+	touch $DIR
+
+	# drop 1 reply with UPDATE lock
+	mcreate $DIR/$tfile || error "mcreate failed: $?"
+	drop_ldlm_reply_once "stat $DIR/$tfile" &
+	sleep 2
+
+	# make the re-sent lock to sleep
+#define OBD_FAIL_MDS_RESEND 0x136
+	do_nodes $list $LCTL set_param fail_loc=0x80000136
+
+	#initiate the re-connect & re-send
+	local mdccli=$($LCTL dl | awk '/-mdc-/ {print $4;}')
+	local conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid)
+	$LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}"
+	sleep 2
+
+	#initiate the client eviction while enqueue re-send is in progress
+	mds_evict_client
+
+	client_reconnect
+	wait
+}
+run_test 66 "lock enqueue re-send vs client eviction"
+
+test_65() {
+	mount_client $DIR2
+
+	#grant lock1, export2
+	$SETSTRIPE -i -0 $DIR2/$tfile || return 1
+	$MULTIOP $DIR2/$tfile Ow || return 2
+
+#define OBD_FAIL_LDLM_BL_EVICT 0x31e
+	do_facet ost $LCTL set_param fail_loc=0x31e
+	#get waiting lock2, export1
+	$MULTIOP $DIR/$tfile Ow &
+	PID1=$!
+	# let enqueue to get asleep
+	sleep 2
+
+	#get lock2 blocked
+	$MULTIOP $DIR2/$tfile Ow &
+	PID2=$!
+	sleep 2
+
+	#evict export1
+	ost_evict_client
+
+	sleep 2
+	do_facet ost $LCTL set_param fail_loc=0
+
+	wait $PID1
+	wait $PID2
+
+	umount_client $DIR2
+}
+run_test 65 "lock enqueue for destroyed export"
+
 check_cli_ir_state()
 {
 	local NODE=${1:-$HOSTNAME}
@@ -1568,10 +1757,9 @@ test_105()
 	# get one of the clients from client list
 	local rcli=$(echo $RCLIENTS |cut -d' ' -f 1)

-	local old_MOUNTOPT=$MOUNTOPT
-	MOUNTOPT=${MOUNTOPT},noir
+	local mount_opts=${MOUNT_OPTS:+$MOUNT_OPTS,}noir
 	zconf_umount $rcli $MOUNT || error "umount failed"
-	zconf_mount $rcli $MOUNT || error "mount failed"
+	zconf_mount $rcli $MOUNT $mount_opts || error "mount failed"

 	# make sure lustre mount at $rcli disabling IR
 	local ir_state=$(check_cli_ir_state $rcli)
@@ -1593,8 +1781,7 @@ test_105()
 	[ $ir_state = "DISABLED" -o $ir_state = "OFF" ] ||
 		error "IR status on ost1 should be DISABLED"

-	# restore it
-	MOUNTOPT=$old_MOUNTOPT
+	# remount with the default MOUNT_OPTS
 	zconf_umount $rcli $MOUNT || error "umount failed"
 	zconf_mount $rcli $MOUNT || error "mount failed"

@@ -1683,6 +1870,23 @@ test_107 () {
 }
 run_test 107 "drop reint reply, then restart MDT"

+test_108() {
+	mkdir -p $DIR/$tdir
+	$SETSTRIPE -c 1 -i 0 $DIR/$tdir
+
+	dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=256 &
+	local dd_pid=$!
+	sleep 0.1
+
+	ost_evict_client
+
+	wait $dd_pid
+
+	client_up || error "reconnect failed"
+	rm -f $DIR/$tdir/$tfile
+}
+run_test 108 "client eviction don't crash"
+
 test_110a () {
 	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
 	local remote_dir=$DIR/$tdir/remote_dir
@@ -1792,9 +1996,9 @@ test_110g () {

 	createmany -o $remote_dir/f 100

-	#define OBD_FAIL_MIGRATE_NET_REP 0x1702
-	do_facet mds$MDTIDX lctl set_param fail_loc=0x1702
-	$LFS mv -M $MDTIDX $remote_dir || error "migrate failed"
+	#define OBD_FAIL_MIGRATE_NET_REP 0x1800
+	do_facet mds$MDTIDX lctl set_param fail_loc=0x1800
+	$LFS migrate -m $MDTIDX $remote_dir || error "migrate failed"
 	do_facet mds$MDTIDX lctl set_param fail_loc=0x0

 	for file in $(find $remote_dir); do
@@ -1807,6 +2011,77 @@ test_110g () {
 }
 run_test 110g "drop reply during migration"

+test_110h () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local src_dir=$DIR/$tdir/source_dir
+	local tgt_dir=$DIR/$tdir/target_dir
+	local MDTIDX=1
+
+	mkdir -p $src_dir
+	$LFS mkdir -i $MDTIDX $tgt_dir
+
+	dd if=/etc/hosts of=$src_dir/src_file
+	touch $tgt_dir/tgt_file
+	drop_update_reply $MDTIDX \
+		"mrename $src_dir/src_file $tgt_dir/tgt_file" ||
+		error "mrename failed"
+
+	$CHECKSTAT -t file $src_dir/src_file &&
+		error "src_file present after rename"
+
+	diff /etc/hosts $tgt_dir/tgt_file ||
+		error "file changed after rename"
+
+}
+run_test 110h "drop update reply during cross-MDT file rename"
+
+test_110i () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local src_dir=$DIR/$tdir/source_dir
+	local tgt_dir=$DIR/$tdir/target_dir
+	local MDTIDX=1
+
+	mkdir -p $src_dir
+	$LFS mkdir -i $MDTIDX $tgt_dir
+
+	mkdir $src_dir/src_dir
+	touch $src_dir/src_dir/a
+	mkdir $tgt_dir/tgt_dir
+	drop_update_reply $MDTIDX \
+		"mrename $src_dir/src_dir $tgt_dir/tgt_dir" ||
+		error "mrename failed"
+
+	$CHECKSTAT -t dir $src_dir/src_dir &&
+		error "src_dir present after rename"
+
+	$CHECKSTAT -t dir $tgt_dir/tgt_dir ||
+		error "tgt_dir not present after rename"
+
+	$CHECKSTAT -t file $tgt_dir/tgt_dir/a ||
+		error "a not present after rename"
+}
+run_test 110i "drop update reply during cross-MDT dir rename"
+
+test_110j () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local remote_dir=$DIR/$tdir/remote_dir
+	local local_dir=$DIR/$tdir/local_dir
+	local MDTIDX=1
+
+	mkdir -p $DIR/$tdir
+	mkdir $DIR/$tdir/local_dir
+	$LFS mkdir -i $MDTIDX $remote_dir
+
+	touch $local_dir/local_file
+	drop_update_reply $MDTIDX \
+		"ln $local_dir/local_file $remote_dir/remote_file" ||
+		error "ln failed"
+
+	$CHECKSTAT -t file $remote_dir/remote_file ||
+		error "remote not present after ln"
+}
+run_test 110j "drop update reply during cross-MDT ln"
+
 # LU-2844 mdt prepare fail should not cause umount oops
 test_111 ()
 {
@@ -1971,12 +2246,104 @@ test_113() {
 	# let the client reconnect
 	client_reconnect
 	EVICT=$($LCTL get_param mdc.$FSNAME-MDT*.state |
-		awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }')
+		awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')

 	[ -z "$EVICT" ] || [[ $EVICT -le $BEFORE ]] || error "eviction happened"
 }
 run_test 113 "ldlm enqueue dropped reply should not cause deadlocks"

+T130_PID=0
+test_130_base() {
+	test_mkdir -p $DIR/$tdir
+
+	# Prevent interference from layout intent RPCs due to
+	# asynchronous writeback. These will be tested in 130c below.
+	do_nodes ${CLIENTS:-$HOSTNAME} sync
+
+	# get only LOOKUP lock on $tdir
+	cancel_lru_locks mdc
+	ls $DIR/$tdir/$tfile 2>/dev/null
+
+	# get getattr by fid on $tdir
+	#
+	# we need to race with unlink, unlink must complete before we will
+	# take a DLM lock, otherwise unlink will wait until getattr will
+	# complete; but later than getattr starts so that getattr found
+	# the object
+#define OBD_FAIL_MDS_INTENT_DELAY 0x160
+	set_nodes_failloc "$(mdts_nodes)" 0x80000160
+	stat $DIR/$tdir &
+	T130_PID=$!
+	sleep 2
+
+	rm -rf $DIR/$tdir
+
+	# drop the reply so that resend happens on an unlinked file.
+#define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
+	set_nodes_failloc "$(mdts_nodes)" 0x80000157
+}
+
+test_130a() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+	test_130_base
+
+	wait $T130_PID || [ $? -eq 0 ] && error "stat should fail"
+	return 0
+}
+run_test 130a "enqueue resend on not existing file"
+
+test_130b() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+	test_130_base
+	# let the reply to be dropped
+	sleep 10
+
+#define OBD_FAIL_SRV_ENOENT 0x217
+	set_nodes_failloc "$(mdts_nodes)" 0x80000217
+
+	wait $T130_PID || [ $? -eq 0 ] && error "stat should fail"
+	return 0
+}
+run_test 130b "enqueue resend on a stale inode"
+
+test_130c() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+
+	do_nodes ${CLIENTS:-$HOSTNAME} sync
+	echo XXX > $DIR/$tfile
+
+	cancel_lru_locks mdc
+
+	# Trigger writeback on $tfile.
+	#
+	# we need to race with unlink, unlink must complete before we will
+	# take a DLM lock, otherwise unlink will wait until intent will
+	# complete; but later than intent starts so that intent found
+	# the object
+#define OBD_FAIL_MDS_INTENT_DELAY 0x160
+	set_nodes_failloc "$(mdts_nodes)" 0x80000160
+	sync &
+	T130_PID=$!
+	sleep 2
+
+	rm $DIR/$tfile
+
+	# drop the reply so that resend happens on an unlinked file.
+#define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
+	set_nodes_failloc "$(mdts_nodes)" 0x80000157
+
+	# let the reply to be dropped
+	sleep 10
+
+#define OBD_FAIL_SRV_ENOENT 0x217
+	set_nodes_failloc "$(mdts_nodes)" 0x80000217
+
+	wait $T130_PID
+
+	return 0
+}
+run_test 130c "layout intent resend on a stale inode"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status