X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-small.sh;h=af5c2097fb9556eabb74d5fcddcd4bb61f1a75dd;hp=ed8ce3aad61d8ca4a26579d779aa3c9846d455a1;hb=8f01f8b51d114b0d2d54a5ab7db3161782e52447;hpb=757a18894de5948b8d8aab053c0b883c64d27782 diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index ed8ce3a..af5c209 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -19,6 +19,10 @@ require_dsh_mds || exit 0 # 1 2.5 2.5 4 4 (min)" [ "$SLOW" = "no" ] && EXCEPT_SLOW="17 26a 26b 50 51 57" +[ $(facet_fstype $SINGLEMDS) = "zfs" ] && +# bug number for skipped test: LU-2547 + ALWAYS_EXCEPT="$ALWAYS_EXCEPT 24a 24b" + build_test_filter # Allow us to override the setup if we already have a mounted system by @@ -145,21 +149,92 @@ test_9() { run_test 9 "pause bulk on OST (bug 1420)" #bug 1521 -test_10() { - do_facet client mcreate $DIR/$tfile || - { error "mcreate failed: $?"; return 1; } - drop_bl_callback "chmod 0777 $DIR/$tfile" || echo "evicted as expected" - # wait for the mds to evict the client - #echo "sleep $(($TIMEOUT*2))" - #sleep $(($TIMEOUT*2)) - do_facet client touch $DIR/$tfile || echo "touch failed, evicted" - do_facet client checkstat -v -p 0777 $DIR/$tfile || - { error "client checkstat failed: $?"; return 3; } - do_facet client "munlink $DIR/$tfile" - # allow recovery to complete - client_up || client_up || sleep $TIMEOUT +test_10a() { + local before=$(date +%s) + local evict + + do_facet client "stat $DIR > /dev/null" || + error "failed to stat $DIR: $?" + drop_bl_callback "chmod 0777 $DIR" || + error "failed to chmod $DIR: $?" + + # let the client reconnect + client_reconnect + evict=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state | + awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }') + [ ! -z "$evict" ] && [[ $evict -gt $before ]] || + (do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state; + error "no eviction: $evict before:$before") + + do_facet client checkstat -v -p 0777 $DIR || + error "client checkstat failed: $?" +} +run_test 10a "finish request on server after client eviction (bug 1521)" + +test_10b() { + local before=$(date +%s) + local evict + + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.53) ]] && + skip "Need MDS version at least 2.6.53" && return + do_facet client "stat $DIR > /dev/null" || + error "failed to stat $DIR: $?" + drop_bl_callback_once "chmod 0777 $DIR" || + error "failed to chmod $DIR: $?" + + # let the client reconnect + client_reconnect + evict=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state | + awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }') + + [ -z "$evict" ] || [[ $evict -le $before ]] || + (do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state; + error "eviction happened: $evict before:$before") + + do_facet client checkstat -v -p 0777 $DIR || + error "client checkstat failed: $?" +} +run_test 10b "re-send BL AST" + +test_10d() { + local before=$(date +%s) + local evict + + [[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.90) ]] && + skip "Need MDS version at least 2.6.90" && return + + # sleep 1 is to make sure that BEFORE is not equal to EVICTED below + sleep 1 + rm -f $TMP/$tfile + echo -n ", world" | dd of=$TMP/$tfile bs=1c seek=5 + + mount_client $MOUNT2 + + cancel_lru_locks osc + $LFS setstripe -i 0 -c 1 $DIR1/$tfile + echo -n hello > $DIR1/$tfile + + stat $DIR2/$tfile >& /dev/null + $LCTL set_param fail_err=71 + drop_bl_callback "echo -n \\\", world\\\" >> $DIR2/$tfile" + + client_reconnect + + cmp $DIR1/$tfile $DIR2/$tfile || error "file contents differ" + cmp $DIR1/$tfile $TMP/$tfile || error "wrong content found" + + evict=$(do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state | \ + tr -d '\-\[\] ' | \ + awk -F"[ [,]" '/EVICTED$/ { if (mx<$1) {mx=$1;} } END { print mx }') + + [[ $evict -gt $before ]] || + (do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state; + error "no eviction: $evict before:$before") + + rm $TMP/$tfile + umount_client $MOUNT2 } -run_test 10 "finish request on server after client eviction (bug 1521)" +run_test 10d "test failed blocking ast" #bug 2460 # wake up a thread waiting for completion after eviction @@ -173,7 +248,8 @@ test_11(){ do_facet client $MULTIOP $DIR/$tfile or || { error "multiop read failed: $?"; return 3; } - drop_bl_callback $MULTIOP $DIR/$tfile Ow || echo "evicted as expected" + drop_bl_callback_once $MULTIOP $DIR/$tfile Ow || + echo "evicted as expected" do_facet client munlink $DIR/$tfile || { error "munlink failed: $?"; return 4; } @@ -260,6 +336,7 @@ test_16() { sleep $TIMEOUT do_facet client "cmp $TMP/$tfile $DIR/$tfile" || return 2 start_read_ahead + rm -f $TMP/$tfile } run_test 16 "timeout bulk put, don't evict client (2732)" @@ -328,7 +405,7 @@ test_18a() { rc=0 pgcache_empty || rc=2 $LCTL --device $osc2dev activate - rm -f $f + rm -f $f $TMP/$tfile return $rc } run_test 18a "manual ost invalidate clears page cache immediately" @@ -361,7 +438,7 @@ test_18b() { # cache after the client reconnects? rc=0 pgcache_empty || rc=2 - rm -f $f + rm -f $f $TMP/$tfile return $rc } run_test 18b "eviction and reconnect clears page cache (2766)" @@ -400,7 +477,7 @@ test_18c() { # cache after the client reconnects? rc=0 pgcache_empty || rc=2 - rm -f $f + rm -f $f $TMP/$tfile return $rc } run_test 18c "Dropped connect reply after eviction handing (14755)" @@ -411,6 +488,9 @@ test_19a() { mount_client $DIR2 || error "failed to mount $DIR2" + # cancel cached locks from OST to avoid eviction from it + cancel_lru_locks osc + do_facet client "stat $DIR > /dev/null" || error "failed to stat $DIR: $?" drop_ldlm_cancel "chmod 0777 $DIR2" || @@ -420,8 +500,9 @@ test_19a() { # let the client reconnect client_reconnect - EVICT=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state | \ - awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }') + EVICT=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state | + awk -F"[ [,]" '/EVICTED ]$/ \ + { if (mx<$5) {mx=$5;} } END { print mx }') [ ! -z "$EVICT" ] && [[ $EVICT -gt $BEFORE ]] || (do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state; @@ -435,6 +516,9 @@ test_19b() { mount_client $DIR2 || error "failed to mount $DIR2: $?" + # cancel cached locks from MDT to avoid eviction from it + cancel_lru_locks mdc + do_facet client $MULTIOP $DIR/$tfile Ow || error "failed to run multiop: $?" drop_ldlm_cancel $MULTIOP $DIR2/$tfile Ow || @@ -446,8 +530,9 @@ test_19b() { # let the client reconnect client_reconnect - EVICT=$(do_facet client $LCTL get_param osc.$FSNAME-OST*.state | \ - awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }') + EVICT=$(do_facet client $LCTL get_param osc.$FSNAME-OST*.state | + awk -F"[ [,]" '/EVICTED ]$/ \ + { if (mx < $5) {mx = $5;} } END { print mx }') [ ! -z "$EVICT" ] && [[ $EVICT -gt $BEFORE ]] || (do_facet client $LCTL get_param osc.$FSNAME-OST*.state; @@ -479,7 +564,7 @@ test_19c() { # let the client reconnect sleep 5 EVICT=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state | - awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }') + awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }') [ -z "$EVICT" ] || [[ $EVICT -le $BEFORE ]] || error "eviction happened" } @@ -757,7 +842,8 @@ test_24a() { # bug 11710 details correct fsync() behavior rc=$? lctl set_param fail_loc=0x0 client_reconnect - [ $rc -eq 0 ] && error_ignore 5494 "multiop didn't fail fsync: rc $rc" || true + [ $rc -eq 0 ] && + error_ignore bz5494 "multiop didn't fail fsync: rc $rc" || true } run_test 24a "fsync error (should return error)" @@ -797,10 +883,10 @@ test_24b() { lctl set_param fail_loc=0x0 client_reconnect [ $rc1 -eq 0 -o $rc2 -eq 0 ] && - error_ignore 5494 "multiop didn't fail fsync: $rc1 or close: $rc2" || + error_ignore bz5494 "multiop didn't fail fsync: $rc1 or close: $rc2" || true - dmesg | grep "dirty page discard:" || \ + dmesg | grep "dirty page discard:" || error "no discarded dirty page found!" } run_test 24b "test dirty page discard due to client eviction" @@ -897,7 +983,8 @@ run_test 27 "fail LOV while using OSC's" test_28() { # bug 6086 - error adding new clients do_facet client mcreate $DIR/$tfile || return 1 - drop_bl_callback "chmod 0777 $DIR/$tfile" ||echo "evicted as expected" + drop_bl_callback_once "chmod 0777 $DIR/$tfile" || + echo "evicted as expected" #define OBD_FAIL_MDS_CLIENT_ADD 0x12f do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012f" # fail once (evicted), reconnect fail (fail_loc), ok @@ -913,7 +1000,7 @@ test_29a() { # bug 22273 - error adding new clients # fail abort so client will be new again fail_abort $SINGLEMDS client_up || error "reconnect failed" - wait_osc_import_state mds ost FULL + wait_osc_import_state $SINGLEMDS ost FULL return 0 } run_test 29a "error adding new clients doesn't cause LBUG (bug 22273)" @@ -949,7 +1036,8 @@ test_50() { rc=$? echo writemany returned $rc #these may fail because of eviction due to slow AST response. - [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true + [ $rc -eq 0 ] || + error_ignore bz13652 "writemany returned rc $rc" || true } run_test 50 "failover MDS under load" @@ -978,10 +1066,11 @@ test_51() { # and recovery was interrupted sleep $TIMEOUT kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID rc=$? echo writemany returned $rc - [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true + [ $rc -eq 0 ] || + error_ignore bz13652 "writemany returned rc $rc" || true } run_test 51 "failover MDS during recovery" @@ -1050,57 +1139,58 @@ test_55() { mkdir -p $DIR/$tdir + # Minimum pass speed is 2MBps + local ddtimeout=64 + # LU-2887/LU-3089 - set min pass speed to 500KBps + [ "$(facet_fstype ost1)" = "zfs" ] && ddtimeout=256 + # first dd should be finished quickly $LFS setstripe -c 1 -i 0 $DIR/$tdir/$tfile-1 - dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4 & + dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4 & DDPID=$! count=0 echo "step1: testing ......" - while [ true ]; do - if [ -z `ps x | awk '$1 == '$DDPID' { print $5 }'` ]; then break; fi - count=$[count+1] - if [ $count -gt 64 ]; then - error "dd should be finished!" - fi - sleep 1 - done + while kill -0 $DDPID 2> /dev/null; do + let count++ + if [ $count -gt $ddtimeout ]; then + error "dd should be finished!" + fi + sleep 1 + done echo "(dd_pid=$DDPID, time=$count)successful" $LFS setstripe -c 1 -i 0 $DIR/$tdir/$tfile-2 #define OBD_FAIL_OST_DROP_REQ 0x21d do_facet ost1 lctl set_param fail_loc=0x0000021d # second dd will be never finished - dd if=/dev/zero of=$DIR/$tdir/$tfile-2 bs=32M count=4 & + dd if=/dev/zero of=$DIR/$tdir/$tfile-2 bs=32M count=4 & DDPID=$! count=0 echo "step2: testing ......" - while [ $count -le 64 ]; do - dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`" - if [ -z $dd_name ]; then - ls -l $DIR/$tdir - echo "debug: (dd_name=$dd_name, dd_pid=$DDPID, time=$count)" - error "dd shouldn't be finished!" - fi - count=$[count+1] - sleep 1 - done + while [ $count -le $ddtimeout ]; do + if ! kill -0 $DDPID 2> /dev/null; then + ls -l $DIR/$tdir + error "dd shouldn't be finished! (time=$count)" + fi + let count++ + sleep 1 + done echo "(dd_pid=$DDPID, time=$count)successful" #Recover fail_loc and dd will finish soon do_facet ost1 lctl set_param fail_loc=0 count=0 echo "step3: testing ......" - while [ true ]; do - if [ -z `ps x | awk '$1 == '$DDPID' { print $5 }'` ]; then break; fi - count=$[count+1] - if [ $count -gt 500 ]; then - error "dd should be finished!" - fi - sleep 1 - done + while kill -0 $DDPID 2> /dev/null; do + let count++ + if [ $count -gt $((ddtimeout + 440)) ]; then + error "dd should be finished!" + fi + sleep 1 + done echo "(dd_pid=$DDPID, time=$count)successful" - rm -rf $DIR/$tdir + rm -rf $DIR/$tdir } run_test 55 "ost_brw_read/write drops timed-out read/write request" @@ -1108,11 +1198,11 @@ test_56() { # b=11277 #define OBD_FAIL_MDS_RESEND 0x136 touch $DIR/$tfile do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000136" - stat $DIR/$tfile + stat $DIR/$tfile || error "stat failed" do_facet $SINGLEMDS "lctl set_param fail_loc=0" rm -f $DIR/$tfile } -run_test 56 "do not allow reconnect to busy exports" +run_test 56 "do not fail on getattr resend" test_57_helper() { # no oscs means no client or mdt @@ -1146,10 +1236,10 @@ test_58() { # bug 11546 pid=$! sleep 1 lctl set_param fail_loc=0 - drop_bl_callback rm -f $DIR/$tfile + drop_bl_callback_once rm -f $DIR/$tfile wait $pid # the first 'df' could tigger the eviction caused by - # 'drop_bl_callback', and it's normal case. + # 'drop_bl_callback_once', and it's normal case. # but the next 'df' should return successfully. do_facet client "df $DIR" || do_facet client "df $DIR" } @@ -1174,17 +1264,17 @@ test_59() { # bug 10589 run_test 59 "Read cancel race on client eviction" err17935 () { - # we assume that all md changes are in the MDT0 changelog - if [ $MDSCOUNT -gt 1 ]; then - error_ignore 17935 $* - else - error $* - fi + # we assume that all md changes are in the MDT0 changelog + if [ $MDSCOUNT -gt 1 ]; then + error_ignore bz17935 $* + else + error $* + fi } test_60() { - MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | \ - awk '{gsub(/_UUID/,""); print $1}' | head -1) + MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | + awk '{ gsub(/_UUID/,""); print $1 }' | head -n1) NUM_FILES=15000 mkdir -p $DIR/$tdir @@ -1248,12 +1338,14 @@ test_61() replay_barrier $SINGLEMDS createmany -o $DIR/$tdir/$tfile-%d 10 - local oid=`do_facet ost1 "lctl get_param -n obdfilter.${ost1_svc}.last_id"` + local oid=$(do_facet ost1 "lctl get_param -n \ + obdfilter.${ost1_svc}.last_id" | sed -e 's/.*://') fail_abort $SINGLEMDS - + touch $DIR/$tdir/$tfile - local id=`$LFS getstripe $DIR/$tdir/$tfile | awk '$1 == 0 { print $2 }'` + local id=$($LFS getstripe $DIR/$tdir/$tfile | + awk '$1 == 0 { print $2 }') [ $id -le $oid ] && error "the orphan objid was reused, failed" # Cleanup @@ -1271,6 +1363,72 @@ run_test 61 "Verify to not reuse orphan objects - bug 17025" #} #run_test 62 "Verify connection flags race - bug LU-1716" +test_66() +{ + [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.51) ]] || + { skip "Need MDS version at least 2.7.51"; return 0; } + + local list=$(comma_list $(osts_nodes)) + + # modify dir so that next revalidate would not obtain UPDATE lock + touch $DIR + + # drop 1 reply with UPDATE lock + mcreate $DIR/$tfile || error "mcreate failed: $?" + drop_ldlm_reply_once "stat $DIR/$tfile" & + sleep 2 + + # make the re-sent lock to sleep +#define OBD_FAIL_MDS_RESEND 0x136 + do_nodes $list $LCTL set_param fail_loc=0x80000136 + + #initiate the re-connect & re-send + local mdccli=$($LCTL dl | awk '/-mdc-/ {print $4;}') + local conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid) + $LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}" + sleep 2 + + #initiate the client eviction while enqueue re-send is in progress + mds_evict_client + + client_reconnect + wait +} +run_test 66 "lock enqueue re-send vs client eviction" + +test_65() { + mount_client $DIR2 + + #grant lock1, export2 + $SETSTRIPE -i -0 $DIR2/$tfile || return 1 + $MULTIOP $DIR2/$tfile Ow || return 2 + +#define OBD_FAIL_LDLM_BL_EVICT 0x31e + do_facet ost $LCTL set_param fail_loc=0x31e + #get waiting lock2, export1 + $MULTIOP $DIR/$tfile Ow & + PID1=$! + # let enqueue to get asleep + sleep 2 + + #get lock2 blocked + $MULTIOP $DIR2/$tfile Ow & + PID2=$! + sleep 2 + + #evict export1 + ost_evict_client + + sleep 2 + do_facet ost $LCTL set_param fail_loc=0 + + wait $PID1 + wait $PID2 + + umount_client $DIR2 +} +run_test 65 "lock enqueue for destroyed export" + check_cli_ir_state() { local NODE=${1:-$HOSTNAME} @@ -1370,8 +1528,8 @@ target_instance_match() local target=${srv}_svc local si=$(do_facet $srv lctl get_param -n $obdname.${!target}.instance) - local ci=$(lctl get_param -n $cliname.${!target}-${cliname}-*.import | \ - awk '/instance/{ print $2 }' |head -1) + local ci=$(lctl get_param -n $cliname.${!target}-${cliname}-*.import | + awk '/instance/{ print $2 }' | head -n1) return $([ $si -eq $ci ]) } @@ -1494,10 +1652,11 @@ test_103() stop mds1 # We need this test because mds is like a client in IR context. - start mds1 $MDSDEV1 || error "MDS should start w/o mgs" + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || + error "MDS should start w/o mgs" # start mgs and remount mds w/ ir - start mgs $MGSDEV + start mgs $(mgsdevname) $MGS_MOUNT_OPTS clients_up # remount client so that fsdb will be created on the MGS @@ -1550,10 +1709,9 @@ test_105() # get one of the clients from client list local rcli=$(echo $RCLIENTS |cut -d' ' -f 1) - local old_MOUNTOPT=$MOUNTOPT - MOUNTOPT=${MOUNTOPT},noir + local mount_opts=${MOUNT_OPTS:+$MOUNT_OPTS,}noir zconf_umount $rcli $MOUNT || error "umount failed" - zconf_mount $rcli $MOUNT || error "mount failed" + zconf_mount $rcli $MOUNT $mount_opts || error "mount failed" # make sure lustre mount at $rcli disabling IR local ir_state=$(check_cli_ir_state $rcli) @@ -1575,8 +1733,7 @@ test_105() [ $ir_state = "DISABLED" -o $ir_state = "OFF" ] || error "IR status on ost1 should be DISABLED" - # restore it - MOUNTOPT=$old_MOUNTOPT + # remount with the default MOUNT_OPTS zconf_umount $rcli $MOUNT || error "umount failed" zconf_mount $rcli $MOUNT || error "mount failed" @@ -1665,13 +1822,38 @@ test_107 () { } run_test 107 "drop reint reply, then restart MDT" +test_108() { + mkdir -p $DIR/$tdir + $SETSTRIPE -c 1 -i 0 $DIR/$tdir + + dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=256 & + local dd_pid=$! + sleep 0.1 + + ost_evict_client + + wait $dd_pid + + client_up || error "reconnect failed" + rm -f $DIR/$tdir/$tfile +} +run_test 108 "client eviction don't crash" + test_110a () { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 local remote_dir=$DIR/$tdir/remote_dir local MDTIDX=1 + local num + + #prepare for 110 test, which need set striped dir on remote MDT. + for num in $(seq $MDSCOUNT); do + do_facet mds$num \ + lctl set_param -n mdt.${FSNAME}*.enable_remote_dir=1 \ + 2>/dev/null + done mkdir -p $DIR/$tdir - drop_request "$LFS mkdir -i $MDTIDX $remote_dir" || + drop_request "$LFS mkdir -i $MDTIDX -c2 $remote_dir" || error "lfs mkdir failed" local diridx=$($GETSTRIPE -M $remote_dir) [ $diridx -eq $MDTIDX ] || error "$diridx != $MDTIDX" @@ -1686,7 +1868,7 @@ test_110b () { local MDTIDX=1 mkdir -p $DIR/$tdir - drop_reint_reply "$LFS mkdir -i $MDTIDX $remote_dir" || + drop_reint_reply "$LFS mkdir -i $MDTIDX -c2 $remote_dir" || error "lfs mkdir failed" diridx=$($GETSTRIPE -M $remote_dir) @@ -1702,7 +1884,7 @@ test_110c () { local MDTIDX=1 mkdir -p $DIR/$tdir - drop_update_reply $((MDTIDX + 1)) "$LFS mkdir -i $MDTIDX $remote_dir" || + drop_update_reply $MDTIDX "$LFS mkdir -i $MDTIDX -c2 $remote_dir" || error "lfs mkdir failed" diridx=$($GETSTRIPE -M $remote_dir) @@ -1718,7 +1900,7 @@ test_110d () { local MDTIDX=1 mkdir -p $DIR/$tdir - $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" + $LFS mkdir -i $MDTIDX -c2 $remote_dir || error "lfs mkdir failed" drop_request "rm -rf $remote_dir" || error "rm remote dir failed" @@ -1734,7 +1916,7 @@ test_110e () { local MDTIDX=1 mkdir -p $DIR/$tdir - $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" + $LFS mkdir -i $MDTIDX -c2 $remote_dir || error "lfs mkdir failed" drop_reint_reply "rm -rf $remote_dir" || error "rm remote dir failed" rm -rf $DIR/$tdir || error "rmdir failed" @@ -1749,7 +1931,7 @@ test_110f () { local MDTIDX=1 mkdir -p $DIR/$tdir - $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" + $LFS mkdir -i $MDTIDX -c2 $remote_dir || error "lfs mkdir failed" drop_update_reply $MDTIDX "rm -rf $remote_dir" || error "rm remote dir failed" @@ -1757,6 +1939,271 @@ test_110f () { } run_test 110f "remove remote directory: drop slave rep" +test_110g () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local remote_dir=$DIR/$tdir/remote_dir + local MDTIDX=1 + + mkdir -p $remote_dir + + createmany -o $remote_dir/f 100 + + #define OBD_FAIL_MIGRATE_NET_REP 0x1702 + do_facet mds$MDTIDX lctl set_param fail_loc=0x1702 + $LFS migrate -m $MDTIDX $remote_dir || error "migrate failed" + do_facet mds$MDTIDX lctl set_param fail_loc=0x0 + + for file in $(find $remote_dir); do + mdt_index=$($LFS getstripe -M $file) + [ $mdt_index == $MDTIDX ] || + error "$file is not on MDT${MDTIDX}" + done + + rm -rf $DIR/$tdir || error "rmdir failed" +} +run_test 110g "drop reply during migration" + +test_110h () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local src_dir=$DIR/$tdir/source_dir + local tgt_dir=$DIR/$tdir/target_dir + local MDTIDX=1 + + mkdir -p $src_dir + $LFS mkdir -i $MDTIDX $tgt_dir + + dd if=/etc/hosts of=$src_dir/src_file + touch $tgt_dir/tgt_file + drop_update_reply $MDTIDX \ + "mrename $src_dir/src_file $tgt_dir/tgt_file" || + error "mrename failed" + + $CHECKSTAT -t file $src_dir/src_file && + error "src_file present after rename" + + diff /etc/hosts $tgt_dir/tgt_file || + error "file changed after rename" + +} +run_test 110h "drop update reply during cross-MDT file rename" + +test_110i () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local src_dir=$DIR/$tdir/source_dir + local tgt_dir=$DIR/$tdir/target_dir + local MDTIDX=1 + + mkdir -p $src_dir + $LFS mkdir -i $MDTIDX $tgt_dir + + mkdir $src_dir/src_dir + touch $src_dir/src_dir/a + mkdir $tgt_dir/tgt_dir + drop_update_reply $MDTIDX \ + "mrename $src_dir/src_dir $tgt_dir/tgt_dir" || + error "mrename failed" + + $CHECKSTAT -t dir $src_dir/src_dir && + error "src_dir present after rename" + + $CHECKSTAT -t dir $tgt_dir/tgt_dir || + error "tgt_dir not present after rename" + + $CHECKSTAT -t file $tgt_dir/tgt_dir/a || + error "a not present after rename" +} +run_test 110i "drop update reply during cross-MDT dir rename" + +test_110j () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local remote_dir=$DIR/$tdir/remote_dir + local local_dir=$DIR/$tdir/local_dir + local MDTIDX=1 + + mkdir -p $DIR/$tdir + mkdir $DIR/$tdir/local_dir + $LFS mkdir -i $MDTIDX $remote_dir + + touch $local_dir/local_file + drop_update_reply $MDTIDX \ + "ln $local_dir/local_file $remote_dir/remote_file" || + error "ln failed" + + $CHECKSTAT -t file $remote_dir/remote_file || + error "remote not present after ln" +} +run_test 110j "drop update reply during cross-MDT ln" + +# LU-2844 mdt prepare fail should not cause umount oops +test_111 () +{ + [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.3.62) ]] || + { skip "Need MDS version at least 2.3.62"; return 0; } + + local mdsdev=$(mdsdevname ${SINGLEMDS//mds/}) +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 + do_facet $SINGLEMDS lctl set_param fail_loc=0x151 + stop $SINGLEMDS || error "stop MDS failed" + start $SINGLEMDS $mdsdev && error "start MDS should fail" + do_facet $SINGLEMDS lctl set_param fail_loc=0 + start $SINGLEMDS $mdsdev || error "start MDS failed" +} +run_test 111 "mdd setup fail should not cause umount oops" + +# LU-793 +test_112a() { + remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + + do_facet_random_file client $TMP/$tfile 100K || + error_noexit "Create random file $TMP/$tfile" + + pause_bulk "cp $TMP/$tfile $DIR/$tfile" $TIMEOUT || + error_noexit "Can't pause_bulk copy" + + df $DIR + # expect cmp to succeed, client resent bulk + cmp $TMP/$tfile $DIR/$tfile || + error_noexit "Wrong data has been written" + rm $DIR/$tfile || + error_noexit "Can't remove file" + rm $TMP/$tfile +} +run_test 112a "bulk resend while orignal request is in progress" + +# parameters: fail_loc CMD RC +test_120_reply() { + local PID + local PID2 + local rc=5 + local fail + + #define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 + #define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 + #define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 + #define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 + + echo + echo -n "** FLOCK REPLY vs. EVICTION race, lock $2" + [ "$1" = "CLEANUP" ] && + fail=0x80000320 && echo ", $1 cp first" + [ "$1" = "REPLY" ] && + fail=0x80000321 && echo ", $1 cp first" + [ "$1" = "DEADLOCK CLEANUP" ] && + fail=0x80000322 && echo " DEADLOCK, CLEANUP cp first" + [ "$1" = "DEADLOCK REPLY" ] && + fail=0x80000323 && echo " DEADLOCK, REPLY cp first" + + if [ x"$2" = x"get" ]; then + #for TEST lock, take a conflict in advance + # sleep longer than evictor to not confuse fail_loc: 2+2+4 + echo "** Taking conflict **" + flocks_test 5 set read sleep 10 $DIR/$tfile & + PID2=$! + + sleep 2 + fi + + $LCTL set_param fail_loc=$fail + + flocks_test 5 $2 write $DIR/$tfile & + PID=$! + + sleep 2 + echo "** Evicting and re-connecting client **" + mds_evict_client + + client_reconnect + + if [ x"$2" = x"get" ]; then + wait $PID2 + fi + + wait $PID + rc=$? + + # check if the return value is allowed + [ $rc -eq $3 ] && rc=0 + + $LCTL set_param fail_loc=0 + return $rc +} + +# a lock is taken, unlock vs. cleanup_resource() race for destroying +# the ORIGINAL lock. +test_120_destroy() +{ + local PID + + flocks_test 5 set write sleep 4 $DIR/$tfile & + PID=$! + sleep 2 + + # let unlock to sleep in CP CB + $LCTL set_param fail_loc=$1 + sleep 4 + + # let cleanup to cleep in CP CB + mds_evict_client + + client_reconnect + + wait $PID + rc=$? + + $LCTL set_param fail_loc=0 + return $rc +} + +test_120() { + flock_is_enabled || { skip "mount w/o flock enabled" && return; } + touch $DIR/$tfile + + test_120_reply "CLEANUP" set 5 || error "SET race failed" + test_120_reply "CLEANUP" get 5 || error "GET race failed" + test_120_reply "CLEANUP" unlock 5 || error "UNLOCK race failed" + + test_120_reply "REPLY" set 5 || error "SET race failed" + test_120_reply "REPLY" get 5 || error "GET race failed" + test_120_reply "REPLY" unlock 5 || error "UNLOCK race failed" + + # DEADLOCK tests + test_120_reply "DEADLOCK CLEANUP" set 5 || error "DEADLOCK race failed" + test_120_reply "DEADLOCK REPLY" set 35 || error "DEADLOCK race failed" + + test_120_destroy 0x320 || error "unlock-cleanup race failed" +} +run_test 120 "flock race: completion vs. evict" + +test_113() { + local BEFORE=$(date +%s) + local EVICT + + # modify dir so that next revalidate would not obtain UPDATE lock + touch $DIR + + # drop 1 reply with UPDATE lock, + # resend should not create 2nd lock on server + mcreate $DIR/$tfile || error "mcreate failed: $?" + drop_ldlm_reply_once "stat $DIR/$tfile" || error "stat failed: $?" + + # 2 BL AST will be sent to client, both must find the same lock, + # race them to not get EINVAL for 2nd BL AST + #define OBD_FAIL_LDLM_PAUSE_CANCEL2 0x31f + $LCTL set_param fail_loc=0x8000031f + + $LCTL set_param ldlm.namespaces.*.early_lock_cancel=0 > /dev/null + chmod 0777 $DIR/$tfile || error "chmod failed: $?" + $LCTL set_param ldlm.namespaces.*.early_lock_cancel=1 > /dev/null + + # let the client reconnect + client_reconnect + EVICT=$($LCTL get_param mdc.$FSNAME-MDT*.state | + awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }') + + [ -z "$EVICT" ] || [[ $EVICT -le $BEFORE ]] || error "eviction happened" +} +run_test 113 "ldlm enqueue dropped reply should not cause deadlocks" + complete $SECONDS check_and_cleanup_lustre exit_status