X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity.sh;h=6ef903ceda9eb5e6b8189b5aaa535ebc93f2cdeb;hb=742597c1aa7f4f0a021866fedf446d174f53e500;hp=260023012f3d37f6bd24522fb341376501816e7b;hpb=b66235ae873564b31027b03903ba15230ba5c7dc;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 2600230..6ef903c 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -7,13 +7,13 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 4900 4900 2108 9789 3637 9789 3561 5188/5749 10764 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27o 27q 42a 42b 42c 42d 45 68 75"} +# bug number for skipped test: 4900 4900 2108 9789 3637 9789 3561 13310 10764 +ALWAYS_EXCEPT=" 27o 27q 42a 42b 42c 42d 45 74b 75 $SANITY_EXCEPT" # bug number for skipped test: 2108 9789 3637 9789 3561 5188/5749 1443 #ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27m 42a 42b 42c 42d 45 68 76"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 36f 36g 51b 51c 63 64b 71 73 101 115" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="24o 27m 36f 36g 51b 51c 60c 63 64b 68 71 73 78 101 103 115 120g" # Tests that fail on uml CPU=`awk '/model/ {print $4}' /proc/cpuinfo` @@ -60,6 +60,8 @@ DIRECTIO=${DIRECTIO:-directio} ACCEPTOR_PORT=${ACCEPTOR_PORT:-988} UMOUNT=${UMOUNT:-"umount -d"} STRIPES_PER_OBJ=-1 +CHECK_GRANT=${CHECK_GRANT:-"yes"} +GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} if [ $UID -ne 0 ]; then echo "Warning: running as non-root uid $UID" @@ -76,49 +78,36 @@ else fi fi -SANITYLOG=${SANITYLOG:-/tmp/sanity.log} - export NAME=${NAME:-local} SAVE_PWD=$PWD +CLEANUP=${CLEANUP:-:} +SETUP=${SETUP:-:} +TRACE=${TRACE:-""} LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} -if [ ! -z "$USING_KRB5" ]; then +if $GSS_KRB5; then $RUNAS krb5_login.sh || exit 1 $RUNAS -u $(($RUNAS_ID + 1)) krb5_login.sh || exit 1 fi +SANITYLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log} +FAIL_ON_ERROR=false + cleanup() { echo -n "cln.." cleanupall ${FORCE} $* || { echo "FAILed to clean up"; exit 20; } } -CLEANUP=${CLEANUP:-:} - setup() { echo -n "mnt.." load_modules setupall || exit 10 echo "done" } -SETUP=${SETUP:-:} - -log() { - echo "$*" - $LCTL mark "$*" 2> /dev/null || true -} - -trace() { - log "STARTING: $*" - strace -o $TMP/$1.strace -ttt $* - RC=$? - log "FINISHED: $*: rc $RC" - return 1 -} -TRACE=${TRACE:-""} check_kernel_version() { VERSION_FILE=$LPROC/version @@ -131,118 +120,14 @@ check_kernel_version() { return 1 } -_basetest() { - echo $* -} - -basetest() { - IFS=abcdefghijklmnopqrstuvwxyz _basetest $1 -} - -run_one() { - if ! grep -q $DIR /proc/mounts; then - $SETUP - fi - testnum=$1 - message=$2 - BEFORE=`date +%s` - log "== test $testnum: $message= `date +%H:%M:%S` ($BEFORE)" - export TESTNAME=test_$testnum - export tfile=f${testnum} - export tdir=d${base} - test_${testnum} || error "exit with rc=$?" - unset TESTNAME - pass "($((`date +%s` - $BEFORE))s)" - cd $SAVE_PWD - $CLEANUP -} - -build_test_filter() { - [ "$ALWAYS_EXCEPT$EXCEPT$SANITY_EXCEPT" ] && \ - echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITY_EXCEPT`" - - for O in $ONLY; do - eval ONLY_${O}=true - done - for E in $EXCEPT $ALWAYS_EXCEPT $SANITY_EXCEPT; do - eval EXCEPT_${E}=true - done -} - -_basetest() { - echo $* -} - -basetest() { - IFS=abcdefghijklmnopqrstuvwxyz _basetest $1 -} - -run_test() { - export base=`basetest $1` - if [ "$ONLY" ]; then - testname=ONLY_$1 - if [ ${!testname}x != x ]; then - run_one $1 "$2" - return $? - fi - testname=ONLY_$base - if [ ${!testname}x != x ]; then - run_one $1 "$2" - return $? - fi - echo -n "." - return 0 - fi - testname=EXCEPT_$1 - if [ ${!testname}x != x ]; then - TESTNAME=test_$1 skip "skipping excluded test $1" - return 0 - fi - testname=EXCEPT_$base - if [ ${!testname}x != x ]; then - TESTNAME=test_$1 skip "skipping excluded test $1 (base $base)" - return 0 - fi - run_one $1 "$2" - return $? -} +if [ "$ONLY" == "cleanup" ]; then + sh llmountcleanup.sh + exit 0 +fi [ "$SANITYLOG" ] && rm -f $SANITYLOG || true -error() { - sysctl -w lustre.fail_loc=0 - log "$0: FAIL: $TESTNAME $@" - $LCTL dk $TMP/lustre-log-$TESTNAME.log - if [ "$SANITYLOG" ]; then - echo "$0: FAIL: $TESTNAME $@" >> $SANITYLOG - else - exit 1 - fi - sysctl -w lustre.fail_loc=0 -} - -pass() { - echo PASS $@ -} - -skip () { - log "$0: SKIP: $TESTNAME $@" - [ "$SANITYLOG" ] && echo "$0: SKIP: $TESTNAME $@" >> $SANITYLOG - -} - -mounted_lustre_filesystems() { - awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts -} - -MOUNTED="`mounted_lustre_filesystems`" -if [ -z "$MOUNTED" ]; then - formatall - setupall - MOUNTED="`mounted_lustre_filesystems`" - [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted" - I_MOUNTED=yes -fi +check_and_setup_lustre DIR=${DIR:-$MOUNT} [ -z "`echo $DIR | grep $MOUNT`" ] && echo "$DIR not in $MOUNT" && exit 99 @@ -253,12 +138,13 @@ STRIPECOUNT=`cat $LPROC/lov/$LOVNAME/stripecount` STRIPESIZE=`cat $LPROC/lov/$LOVNAME/stripesize` ORIGFREE=`cat $LPROC/lov/$LOVNAME/kbytesavail` MAXFREE=${MAXFREE:-$((200000 * $OSTCOUNT))} -MDS=$(\ls $LPROC/mdt 2> /dev/null | grep -v num_refs | tail -n 1) [ -f $DIR/d52a/foo ] && chattr -a $DIR/d52a/foo [ -f $DIR/d52b/foo ] && chattr -i $DIR/d52b/foo rm -rf $DIR/[Rdfs][1-9]* +check_runas_id $RUNAS_ID $RUNAS + build_test_filter if [ "${ONLY}" = "MOUNT" ] ; then @@ -566,6 +452,15 @@ test_17d() { } run_test 17d "symlinks: create dangling ========================" +test_17e() { + mkdir -p $DIR/$tdir + local foo=$DIR/$tdir/$tfile + ln -s $foo $foo || error "create symlink failed" + ls -l $foo || error "ls -l failed" + ls $foo && error "ls not failed" || true +} +run_test 17e "symlinks: create recursive symlink (should return error) ====" + test_17f() { mkdir -p $DIR/d17f ln -s 1234567890/2234567890/3234567890/4234567890 $DIR/d17f/111 @@ -635,7 +530,6 @@ run_test 21 "write to dangling link ============================" test_22() { WDIR=$DIR/$tdir - mkdir $WDIR chown $RUNAS_ID $WDIR (cd $WDIR || error "cd $WDIR failed"; $RUNAS tar cf - /etc/hosts /etc/sysconfig/network | \ @@ -903,15 +797,15 @@ run_test 26e "unlink multiple component recursive symlink ======" # recursive symlinks (bug 7022) test_26f() { - mkdir $DIR/$tfile || error "mkdir $DIR/$tfile failed" - cd $DIR/$tfile || error "cd $DIR/$tfile failed" - mkdir -p $tdir/bar1 || error "mkdir $tdir/bar1 failed" + mkdir $DIR/$tdir/$tfile || error "mkdir $DIR/$tdir/$tfile failed" + cd $DIR/$tdir/$tfile || error "cd $DIR/$tdir/$tfile failed" + mkdir -p lndir/bar1 || error "mkdir lndir/bar1 failed" mkdir $tfile || error "mkdir $tfile failed" cd $tfile || error "cd $tfile failed" ln -s .. dotdot || error "ln dotdot failed" - ln -s dotdot/$tdir $tdir || error "ln $tdir failed" - cd ../.. || error "cd ../.. failed" - output=`ls $tfile/$tfile/$tdir/bar1` + ln -s dotdot/lndir lndir || error "ln lndir failed" + cd $DIR/$tdir || error "cd $DIR/$tdir failed" + output=`ls $tfile/$tfile/lndir/bar1` [ "$output" = bar1 ] && error "unexpected output" rm -r $tfile || error "rm $tfile failed" $CHECKSTAT -a $DIR/$tfile || error "$tfile not gone" @@ -1070,8 +964,8 @@ exhaust_all_precreations() { } test_27n() { - [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && \ - skip "too few OSTs, or remote MDS" && return + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return reset_enospc rm -f $DIR/d27/f27n @@ -1084,8 +978,8 @@ test_27n() { run_test 27n "create file with some full OSTs ==================" test_27o() { - [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && \ - skip "too few OSTs, or remote MDS" && return + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return reset_enospc rm -f $DIR/d27/f27o @@ -1100,8 +994,8 @@ test_27o() { run_test 27o "create file with all full OSTs (should error) ====" test_27p() { - [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && \ - skip "too few OSTs, or remote MDS" && return + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return reset_enospc rm -f $DIR/d27/f27p @@ -1119,8 +1013,8 @@ test_27p() { run_test 27p "append to a truncated file with some full OSTs ===" test_27q() { - [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && \ - skip "too few OSTs, or remote MDS" && return + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return reset_enospc rm -f $DIR/d27/f27q @@ -1139,8 +1033,8 @@ test_27q() { run_test 27q "append to truncated file with all OSTs full (should error) ===" test_27r() { - [ "$OSTCOUNT" -lt "2" -o -z "$MDS" ] && \ - skip "too few OSTs, or remote MDS" && return + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return reset_enospc rm -f $DIR/d27/f27r @@ -1169,7 +1063,53 @@ test_27t() { # bug 10864 } run_test 27t "check that utils parse path correctly" -test_27x() { # bug 10997 +test_27u() { # bug 4900 + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return + + #define OBD_FAIL_MDS_OSC_PRECREATE 0x13d + + sysctl -w lustre.fail_loc=0x13d + mkdir -p $DIR/d27u + createmany -o $DIR/d27u/t- 1000 + sysctl -w lustre.fail_loc=0 + + $LFS getstripe $DIR/d27u > $TMP/files + OBJS=`cat $TMP/files | awk -vobjs=0 '($1 == 0) { objs += 1 } END { print objs;}'` + unlinkmany $DIR/d27u/t- 1000 + [ $OBJS -gt 0 ] && \ + error "Found $OBJS objects were created on OST-0" || pass +} +run_test 27u "skip object creation on OSC w/o objects ==========" + +test_27v() { # bug 4900 + [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return + remote_mds && skip "remote MDS" && return + + exhaust_all_precreations + + mkdir -p $DIR/$tdir + lfs setstripe $DIR/$tdir 0 -1 1 # 1 stripe / file + + touch $DIR/$tdir/$tfile + #define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 + sysctl -w lustre.fail_loc=0x705 + START=`date +%s` + for F in `seq 1 32`; do + touch $DIR/$tdir/$tfile.$F + done + sysctl -w lustre.fail_loc=0 + + FINISH=`date +%s` + TIMEOUT=`sysctl -n lustre.timeout` + [ $((FINISH - START)) -ge $((TIMEOUT / 2)) ] && \ + error "$FINISH - $START >= $TIMEOUT / 2" + + reset_enospc +} +run_test 27v "skip object creation on slow OST =================" + +test_27w() { # bug 10997 mkdir -p $DIR/d27w || error "mkdir failed" $LSTRIPE $DIR/d27w/f0 -s 65536 || error "lstripe failed" size=`$LSTRIPEINFO $DIR/d27w/f0 | awk {'print $1'}` @@ -1185,7 +1125,7 @@ test_27x() { # bug 10997 [ $index -ne $offset ] && error "stripe offset $index != $offset" || true done } -run_test 27x "check lfs setstripe -c -s -i options =============" +run_test 27w "check lfs setstripe -c -s -i options =============" test_28() { mkdir $DIR/d28 @@ -1740,9 +1680,7 @@ test_36f() { } run_test 36f "utime on file racing with OST BRW write ==========" -if [ -d $LPROC/obdfilter ]; then -export FMD_MAX_AGE=`cat $LPROC/obdfilter/*/client_cache_seconds | head -n 1` -fi +export FMD_MAX_AGE=`do_facet ost1 cat $LPROC/obdfilter/*/client_cache_seconds | head -n 1` test_36g() { [ -z "$FMD_MAX_AGE" ] && skip "skip test for remote OST" && return FMD_BEFORE="`awk '/ll_fmd_cache/ { print $2 }' /proc/slabinfo`" @@ -1950,11 +1888,14 @@ test_42d() { run_test 42d "test complete truncate of file with cached dirty data" test_43() { - mkdir $DIR/$tdir cp -p /bin/ls $DIR/$tdir/$tfile - exec 9>> $DIR/$tdir/$tfile + multiop $DIR/$tdir/$tfile Ow_c & + pid=$! + # give multiop a chance to open + sleep 1 + $DIR/$tdir/$tfile && error || true - exec 9<&- + kill -USR1 $pid } run_test 43 "execution of file opened for write should return -ETXTBSY" @@ -1996,7 +1937,7 @@ run_test 43c "md5sum of copy into lustre========================" test_44() { [ "$OSTCOUNT" -lt "2" ] && skip "skipping 2-stripe test" && return dd if=/dev/zero of=$DIR/f1 bs=4k count=1 seek=1023 - dd if=$DIR/f1 bs=4k count=1 + dd if=$DIR/f1 bs=4k count=1 > /dev/null } run_test 44 "zero length read from a sparse stripe =============" @@ -2111,8 +2052,8 @@ test_48a() { # bug 2399 touch .foo || error "'touch .foo' failed after recreating cwd" mkdir .bar || error "'mkdir .foo' failed after recreating cwd" fi - ls . || error "'ls .' failed after recreating cwd" - ls .. || error "'ls ..' failed after removing cwd" + ls . > /dev/null || error "'ls .' failed after recreating cwd" + ls .. > /dev/null || error "'ls ..' failed after removing cwd" cd . || error "'cd .' failed after recreating cwd" mkdir . && error "'mkdir .' worked after recreating cwd" rmdir . && error "'rmdir .' worked after recreating cwd" @@ -2132,9 +2073,9 @@ test_48b() { # bug 2399 touch .foo && error "'touch .foo' worked after removing cwd" mkdir .foo && error "'mkdir .foo' worked after removing cwd" fi - ls . && error "'ls .' worked after removing cwd" - ls .. || error "'ls ..' failed after removing cwd" - cd . && error "'cd .' worked after removing cwd" + ls . > /dev/null && error "'ls .' worked after removing cwd" + ls .. > /dev/null || error "'ls ..' failed after removing cwd" + is_patchless || ( cd . && error "'cd .' worked after removing cwd" ) mkdir . && error "'mkdir .' worked after removing cwd" rmdir . && error "'rmdir .' worked after removing cwd" ln -s . foo && error "'ln -s .' worked after removing cwd" @@ -2157,7 +2098,7 @@ test_48c() { # bug 2350 fi $TRACE ls . && error "'ls .' worked after removing cwd" $TRACE ls .. || error "'ls ..' failed after removing cwd" - $TRACE cd . && error "'cd .' worked after removing cwd" + is_patchless || ( $TRACE cd . && error "'cd .' worked after removing cwd" ) $TRACE mkdir . && error "'mkdir .' worked after removing cwd" $TRACE rmdir . && error "'rmdir .' worked after removing cwd" $TRACE ln -s . foo && error "'ln -s .' worked after removing cwd" @@ -2181,11 +2122,11 @@ test_48d() { # bug 2350 fi $TRACE ls . && error "'ls .' worked after removing parent" $TRACE ls .. && error "'ls ..' worked after removing parent" - $TRACE cd . && error "'cd .' worked after recreate parent" + is_patchless || ( $TRACE cd . && error "'cd .' worked after recreate parent" ) $TRACE mkdir . && error "'mkdir .' worked after removing parent" $TRACE rmdir . && error "'rmdir .' worked after removing parent" $TRACE ln -s . foo && error "'ln -s .' worked after removing parent" - $TRACE cd .. && error "'cd ..' worked after removing parent" || true + is_patchless || ( $TRACE cd .. && error "'cd ..' worked after removing parent" || true ) } run_test 48d "Access removed parent subdir (should return errors)" @@ -2228,6 +2169,7 @@ test_51() { FNUM=$(($FNUM + 1)) echo -n "+" done + echo ls -l $DIR/d51 > /dev/null || error } run_test 51 "special situations: split htree with empty entry ==" @@ -2325,6 +2267,8 @@ test_52b() { run_test 52b "immutable flag test (should return errors) =======" test_53() { + remote_mds && skip "remote MDS" && return + # only test MDT0000 for i in `ls -d $LPROC/osc/*-osc-MDT0000 2> /dev/null` ; do ostname=`basename $i | cut -d - -f 1-2` @@ -2403,7 +2347,7 @@ test_54e() { check_kernel_version 46 || return 0 f="$DIR/f54e" string="aaaaaa" - mknod $f c 4 0 + mknod $f c 5 0 echo $string > $f || error } run_test 54e "console/tty device works in lustre ======================" @@ -2504,6 +2448,25 @@ setup_56() { fi } +setup_56_special() { + LOCAL_NUMFILES=$1 + LOCAL_NUMDIRS=$2 + TDIR=$DIR/${tdir}g + setup_56 $1 $2 + if [ ! -e "$TDIR/loop1b" ] ; then + for i in `seq 1 $LOCAL_NUMFILES` ; do + mknod $TDIR/loop${i}b b 7 $i + mknod $TDIR/null${i}c c 1 3 + ln -s $TDIR/file1 $TDIR/link${i}l + done + for i in `seq 1 $LOCAL_NUMDIRS` ; do + mknod $TDIR/dir$i/loop${i}b b 7 $i + mknod $TDIR/dir$i/null${i}c c 1 3 + ln -s $TDIR/dir$i/file1 $TDIR/dir$i/link${i}l + done + fi +} + test_56g() { $LSTRIPE -d $DIR @@ -2536,9 +2499,86 @@ test_56h() { } run_test 56h "check lfs find ! -name =============================" +test_56i() { + tdir=${tdir}i + mkdir -p $DIR/$tdir + UUID=`$GETSTRIPE $DIR/$tdir | awk '/0: / { print $2 }'` + OUT="`$LFIND -ost $UUID $DIR/$tdir`" + [ "$OUT" ] && error "$LFIND returned directory '$OUT'" || true +} +run_test 56i "check 'lfs find -ost UUID' skips directories =======" + +test_56j() { + setup_56_special $NUMFILES $NUMDIRS + + EXPECTED=$((NUMDIRS+1)) + NUMS=`$LFIND -type d $DIR/${tdir}g | wc -l` + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -type d $DIR/${tdir}g wrong: found $NUMS, expected $EXPECTED" +} +run_test 56j "check lfs find -type d =============================" + +test_56k() { + setup_56_special $NUMFILES $NUMDIRS + + EXPECTED=$(((NUMDIRS+1) * NUMFILES)) + NUMS=`$LFIND -type f $DIR/${tdir}g | wc -l` + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -type f $DIR/${tdir}g wrong: found $NUMS, expected $EXPECTED" +} +run_test 56k "check lfs find -type f =============================" + +test_56l() { + setup_56_special $NUMFILES $NUMDIRS + + EXPECTED=$((NUMDIRS + NUMFILES)) + NUMS=`$LFIND -type b $DIR/${tdir}g | wc -l` + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -type b $DIR/${tdir}g wrong: found $NUMS, expected $EXPECTED" +} +run_test 56l "check lfs find -type b =============================" + +test_56m() { + setup_56_special $NUMFILES $NUMDIRS + + EXPECTED=$((NUMDIRS + NUMFILES)) + NUMS=`$LFIND -type c $DIR/${tdir}g | wc -l` + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -type c $DIR/${tdir}g wrong: found $NUMS, expected $EXPECTED" +} +run_test 56m "check lfs find -type c =============================" + +test_56n() { + setup_56_special $NUMFILES $NUMDIRS + + EXPECTED=$((NUMDIRS + NUMFILES)) + NUMS=`$LFIND -type l $DIR/${tdir}g | wc -l` + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -type l $DIR/${tdir}g wrong: found $NUMS, expected $EXPECTED" +} +run_test 56n "check lfs find -type l =============================" + +test_56o() { + setup_56 $NUMFILES $NUMDIRS + TDIR=$DIR/${tdir}g + + utime $TDIR/file1 > /dev/null || error + utime $TDIR/file2 > /dev/null || error + utime $TDIR/dir1 > /dev/null || error + utime $TDIR/dir2 > /dev/null || error + utime $TDIR/dir1/file1 > /dev/null || error + + EXPECTED=5 + NUMS=`$LFIND -mtime +1 $TDIR | wc -l` + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -mtime $TDIR wrong: found $NUMS, expected $EXPECTED" +} +run_test 56o "check lfs find -mtime for old files ==========================" + test_57a() { # note test will not do anything if MDS is not local - [ -z "$MDS" ] && skip "skipping test for remote MDS" && return + remote_mds && skip "remote MDS" && return + for DEV in `cat $LPROC/mds/*/mntdev`; do dumpe2fs -h $DEV > $TMP/t57a.dump || error "can't access $DEV" DEVISIZE=`awk '/Inode size:/ { print $3 }' $TMP/t57a.dump` @@ -2673,7 +2713,7 @@ run_test 63 "Verify oig_wait interruption does not crash =======" # bug 2248 - async write errors didn't return to application on sync # bug 3677 - async write errors left page locked test_63b() { - DBG_SAVE="`sysctl -n lnet.debug`" + debugsave sysctl -w lnet.debug=-1 # ensure we have a grant to do async writes @@ -2683,15 +2723,11 @@ test_63b() { #define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 sysctl -w lustre.fail_loc=0x80000406 multiop $DIR/$tfile Owy && \ - $LCTL dk /tmp/test63b.debug && \ - sysctl -w lnet.debug="$DBG_SAVE" && \ error "sync didn't return ENOMEM" sync; sleep 2; sync # do a real sync this time to flush page grep locked $LPROC/llite/*/dump_page_cache && \ - $LCTL dk /tmp/test63b.debug && \ - sysctl -w lnet.debug="$DBG_SAVE" && \ error "locked page left in cache after async error" || true - sysctl -w lnet.debug="$DBG_SAVE" + debugrestore } run_test 63b "async write errors should be returned to fsync ===" @@ -2734,22 +2770,21 @@ test_65c() { } run_test 65c "directory setstripe $(($STRIPESIZE * 4)) 1 $(($OSTCOUNT - 1))" -if [ $STRIPECOUNT -eq 0 ]; then - sc=1 -elif [ $STRIPECOUNT -gt 160 ]; then -#LOV_MAX_STRIPE_COUNT is 160, 4294967295(-1) is included. - [ $OSTCOUNT -gt 160 ] && sc=160 || sc=$(($OSTCOUNT - 1)) -else - sc=$(($STRIPECOUNT - 1)) -fi - test_65d() { mkdir -p $DIR/d65 + if [ $STRIPECOUNT -le 0 ]; then + sc=1 + elif [ $STRIPECOUNT -gt 160 ]; then +#LOV_MAX_STRIPE_COUNT is 160 + [ $OSTCOUNT -gt 160 ] && sc=160 || sc=$(($OSTCOUNT - 1)) + else + sc=$(($STRIPECOUNT - 1)) + fi $SETSTRIPE $DIR/d65 $STRIPESIZE -1 $sc || error "setstripe" touch $DIR/d65/f4 $DIR/d65/f5 $LVERIFY $DIR/d65 $DIR/d65/f4 $DIR/d65/f5 || error "lverify failed" } -run_test 65d "directory setstripe $STRIPESIZE -1 $sc ==============" +run_test 65d "directory setstripe $STRIPESIZE -1 stripe_count ==============" test_65e() { mkdir -p $DIR/d65 @@ -2796,7 +2831,7 @@ test_65j() { # bug6367 cleanup -f || error "failed to unmount" setup fi - $SETSTRIPE -d $MOUNT + $SETSTRIPE -d $MOUNT || error "setstripe failed" } run_test 65j "set default striping on root directory (bug 6367)=" @@ -2809,7 +2844,7 @@ test_65k() { # bug11679 echo $OSC "is activate" do_facet mds lctl --device %$OSC activate done - mkdir -p $DIR/$tdir + do_facet client mkdir -p $DIR/$tdir for INACTIVE_OSC in $MDS_OSCS; do echo $INACTIVE_OSC "is Deactivate:" do_facet mds lctl --device %$INACTIVE_OSC deactivate @@ -2818,11 +2853,11 @@ test_65k() { # bug11679 STRIPE_INDEX=`do_facet mds cat $LPROC/lov/*md*/target_obd | grep $STRIPE_OST | awk -F: '{print $1}'` echo "$SETSTRIPE $DIR/$tdir/${STRIPE_INDEX} 0 ${STRIPE_INDEX} 1" - do_facet mds $SETSTRIPE $DIR/$tdir/${STRIPE_INDEX} 0 ${STRIPE_INDEX} 1 + do_facet client $SETSTRIPE $DIR/$tdir/${STRIPE_INDEX} 0 ${STRIPE_INDEX} 1 RC=$? [ $RC -ne 0 ] && error "setstripe should have succeeded" done - rm -f $DIR/$tdir/* + do_facet client rm -f $DIR/$tdir/* echo $INACTIVE_OSC "is Activate." do_facet mds lctl --device %$INACTIVE_OSC activate done @@ -2830,9 +2865,9 @@ test_65k() { # bug11679 run_test 65k "validate manual striping works properly with deactivated OSCs" test_65l() { # bug 12836 - mkdir -p $DIR/$tdir - $LFS setstripe $DIR/$tdir 65536 -1 -1 - $LFS find -mtime -1 $DIR + mkdir -p $DIR/$tdir/test_dir + $LFS setstripe $DIR/$tdir/test_dir 65536 -1 -1 + $LFS find -mtime -1 $DIR/$tdir >/dev/null } run_test 65l "lfs find on -1 stripe dir ========================" @@ -2852,12 +2887,14 @@ test_67() { } run_test 67 "security test =====================================" +LLOOP= cleanup_68() { trap 0 - if [ "$LOOPDEV" ]; then - swapoff $LOOPDEV || error "swapoff failed" - losetup -d $LOOPDEV || error "losetup -d failed" - unset LOOPDEV LOOPNUM + if [ ! -z "$LLOOP" ]; then + swapoff $LLOOP || error "swapoff failed" + $LCTL blockdev_detach $LLOOP || error "detach failed" + rm -f $LLOOP + unset LLOOP fi rm -f $DIR/f68 } @@ -2870,6 +2907,7 @@ swap_used() { swapon -s | awk '($1 == "'$1'") { print $4 }' } + # excercise swapping to lustre by adding a high priority swapfile entry # and then consuming memory until it is used. test_68() { @@ -2877,20 +2915,30 @@ test_68() { grep -q obdfilter $LPROC/devices && \ skip "local OST" && return - find_loop_dev - dd if=/dev/zero of=$DIR/f68 bs=64k count=1024 + grep -q llite_lloop /proc/modules + [ $? -ne 0 ] && skip "can't find module llite_lloop" && return + + [ -z "`$LCTL list_nids | grep -v tcp`" ] && \ + skip "can't reliably test swap with TCP" && return + + MEMTOTAL=`meminfo MemTotal` + NR_BLOCKS=$((MEMTOTAL>>8)) + [[ $NR_BLOCKS -le 2048 ]] && NR_BLOCKS=2048 + + LLOOP=$TMP/lloop.`date +%s`.`date +%N` + dd if=/dev/zero of=$DIR/f68 bs=64k seek=$NR_BLOCKS count=1 + mkswap $DIR/f68 + + $LCTL blockdev_attach $DIR/f68 $LLOOP || error "attach failed" trap cleanup_68 EXIT - losetup $LOOPDEV $DIR/f68 || error "losetup $LOOPDEV failed" - mkswap $LOOPDEV - swapon -p 32767 $LOOPDEV || error "swapon $LOOPDEV failed" + swapon -p 32767 $LLOOP || error "swapon $LLOOP failed" - echo "before: `swapon -s | grep $LOOPDEV`" - KBFREE=`meminfo MemTotal` - $MEMHOG $KBFREE || error "error allocating $KBFREE kB" - echo "after: `swapon -s | grep $LOOPDEV`" - SWAPUSED=`swap_used $LOOPDEV` + echo "before: `swapon -s | grep $LLOOP`" + $MEMHOG $MEMTOTAL || error "error allocating $MEMTOTAL kB" + echo "after: `swapon -s | grep $LLOOP`" + SWAPUSED=`swap_used $LLOOP` cleanup_68 @@ -2903,8 +2951,7 @@ run_test 68 "support swapping to Lustre ========================" test_69() { [ $(grep -c obdfilter $LPROC/devices) -eq 0 ] && \ skip "skipping test for remote OST" && return - [ ! -z "$USING_KRB5" ] && \ - skip "gss with bulk security will triger oops. re-enable this after b10091 get fixed" && return + $GSS && skip "gss with bulk security will triger oops. re-enable this after b10091 get fixed" && return f="$DIR/$tfile" touch $f @@ -3008,18 +3055,32 @@ test_73() { } run_test 73 "multiple MDC requests (should not deadlock)" -test_74() { # bug 6149, 6184 +test_74a() { # bug 6149, 6184 + #define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e + # + # very important to OR with OBD_FAIL_ONCE (0x80000000) -- otherwise it + # will spin in a tight reconnection loop + touch $DIR/f74a + sysctl -w lustre.fail_loc=0x8000030e + # get any lock that won't be difficult - lookup works. + ls $DIR/f74a + sysctl -w lustre.fail_loc=0 + true +} +run_test 74a "ldlm_enqueue freed-export error path, ls (shouldn't LBUG)" + +test_74b() { # bug 13310 #define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e # # very important to OR with OBD_FAIL_ONCE (0x80000000) -- otherwise it # will spin in a tight reconnection loop sysctl -w lustre.fail_loc=0x8000030e - # get any lock - touch $DIR/f74 + # get a "difficult" lock + touch $DIR/f74b sysctl -w lustre.fail_loc=0 true } -run_test 74 "ldlm_enqueue freed-export error path (shouldn't LBUG)" +run_test 74b "ldlm_enqueue freed-export error path, touch (shouldn't LBUG)" JOIN=${JOIN:-"lfs join"} F75=$DIR/f75 @@ -3173,8 +3234,8 @@ run_test 76 "destroy duplicate inodes in client inode cache ====" export ORIG_CSUM="" set_checksums() { - [ "$ORIG_CSUM" ]||ORIG_CSUM=`cat $LPROC/llite/*/checksum_pages|head -n1` - for f in $LPROC/llite/*/checksum_pages; do + [ "$ORIG_CSUM" ] || ORIG_CSUM=`cat $LPROC/osc/*/checksums | head -n1` + for f in $LPROC/osc/*/checksums; do echo $1 >> $f done @@ -3306,6 +3367,41 @@ test_78() { # bug 10901 } run_test 78 "handle large O_DIRECT writes correctly ============" +test_79() { # bug 12743 + [ $(grep -c obdfilter $LPROC/devices) -eq 0 ] && + skip "skipping test for remote OST" && return + + wait_delete_completed + + BKTOTAL=`awk 'BEGIN{total=0}; {total+=$1}; END{print total}' \ + $LPROC/obdfilter/*/kbytestotal` + BKFREE=`awk 'BEGIN{free=0}; {free+=$1}; END{print free}' \ + $LPROC/obdfilter/*/kbytesfree` + BKAVAIL=`awk 'BEGIN{avail=0}; {avail+=$1}; END{print avail}' \ + $LPROC/obdfilter/*/kbytesavail` + STRING=`df -P $MOUNT | tail -n 1 | awk '{print $2","$3","$4}'` + DFTOTAL=`echo $STRING | cut -d, -f1` + DFUSED=`echo $STRING | cut -d, -f2` + DFAVAIL=`echo $STRING | cut -d, -f3` + DFFREE=$(($DFTOTAL - $DFUSED)) + + ALLOWANCE=$((64 * $OSTCOUNT)) + + if [ $DFTOTAL -lt $(($BKTOTAL - $ALLOWANCE)) ] || + [ $DFTOTAL -gt $(($BKTOTAL + $ALLOWANCE)) ] ; then + error "df total($DFTOTAL) mismatch OST total($BKTOTAL)" + fi + if [ $DFFREE -lt $(($BKFREE - $ALLOWANCE)) ] || + [ $DFFREE -gt $(($BKFREE + $ALLOWANCE)) ] ; then + error "df free($DFFREE) mismatch OST free($BKFREE)" + fi + if [ $DFAVAIL -lt $(($BKAVAIL - $ALLOWANCE)) ] || + [ $DFAVAIL -gt $(($BKAVAIL + $ALLOWANCE)) ] ; then + error "df avail($DFAVAIL) mismatch OST avail($BKAVAIL)" + fi +} +run_test 79 "df report consistency check =======================" + # on the LLNL clusters, runas will still pick up root's $TMP settings, # which will not be writable for the runas user, and then you get a CVS # error message with a corrupt path string (CVS bug) and panic. @@ -3473,13 +3569,13 @@ setup_test102() { done done - cd .. + cd $DIR star -c f=$TMP/f102.tar $tdir SETUP_TEST102=yes } cleanup_test102() { - [ "SETUP_TEST102" = "YES" ] || return + [ "$SETUP_TEST102" = "yes" ] || return trap 0 rm -f $TMP/f102.tar rm -rf $DIR/$tdir @@ -3554,28 +3650,28 @@ test_102b() { $GETSTRIPE -v $testfile2 > $tmp_file local stripe_size=`grep "size" $tmp_file| awk '{print $2}'` local stripe_count=`grep "count" $tmp_file| awk '{print $2}'` - [ $stripe_size -eq 65536 ] || error "stripe size $stripe_size != 65536" - [ $stripe_count -eq 2 ] || error "stripe count $stripe_count != 2" + [ "$stripe_size" -eq 65536 ] || error "stripe size $stripe_size != 65536" + [ "$stripe_count" -eq 2 ] || error "stripe count $stripe_count != 2" } run_test 102b "getfattr/setfattr for trusted.lov EAs ============" test_102c() { - # b10930: get/set/list trusted.lov xattr - echo "get/set/list trusted.lov xattr ..." + # b10930: get/set/list lustre.lov xattr + echo "get/set/list lustre.lov xattr ..." [ "$OSTCOUNT" -lt "2" ] && skip "skipping 2-stripe test" && return mkdir -p $DIR/$tdir chown $RUNAS_ID $DIR/$tdir local testfile=$DIR/$tdir/$tfile $RUNAS $SETSTRIPE $testfile 65536 1 2 - $RUNAS getfattr -d -m "^trusted" $testfile 2> /dev/null | \ - grep "trusted.lov" || error "can't get trusted.lov from $testfile" + $RUNAS getfattr -d -m "^lustre" $testfile 2> /dev/null | \ + grep "lustre.lov" || error "can't get lustre.lov from $testfile" local testfile2=${testfile}2 - local value=`getfattr -n trusted.lov $testfile 2> /dev/null | \ - grep "trusted.lov" |sed -e 's/[^=]\+=//' ` + local value=`getfattr -n lustre.lov $testfile 2> /dev/null | \ + grep "lustre.lov" |sed -e 's/[^=]\+=//' ` $RUNAS $MCREATE $testfile2 - $RUNAS setfattr -n trusted.lov -v $value $testfile2 + $RUNAS setfattr -n lustre.lov -v $value $testfile2 local tmp_file=${testfile}3 $RUNAS $GETSTRIPE -v $testfile2 > $tmp_file local stripe_size=`grep "size" $tmp_file| awk '{print $2}'` @@ -3583,7 +3679,7 @@ test_102c() { [ $stripe_size -eq 65536 ] || error "stripe size $stripe_size != 65536" [ $stripe_count -eq 2 ] || error "stripe count $stripe_count != 2" } -run_test 102c "non-root getfattr/setfattr for trusted.lov EAs ===========" +run_test 102c "non-root getfattr/setfattr for lustre.lov EAs ===========" get_stripe_info() { stripe_size=0 @@ -3726,7 +3822,7 @@ run_test 102g "star copy files, keep osts ===========" run_acl_subtest() { - $SAVE_PWD/acl/run $SAVE_PWD/acl/$1.test + $LUSTRE/tests/acl/run $LUSTRE/tests/acl/$1.test return $? } @@ -3734,7 +3830,7 @@ test_103 () { [ "$UID" != 0 ] && skip "must run as root" && return [ -z "$(grep acl $LPROC/mdc/*-mdc-*/connect_flags)" ] && skip "must have acl enabled" && return [ -z "$(which setfacl 2>/dev/null)" ] && skip "could not find setfacl" && return - [ ! -z "$USING_KRB5" ] && skip "could not run under gss" && return + $GSS && skip "could not run under gss" && return SAVE_UMASK=`umask` umask 0022 @@ -3754,7 +3850,7 @@ test_103 () { # inheritance test got from HP echo "performing inheritance..." - cp $SAVE_PWD/acl/make-tree . || error + cp $LUSTRE/tests/acl/make-tree . || error chmod +x make-tree || error run_acl_subtest inheritance || error rm -f make-tree @@ -3816,7 +3912,6 @@ test_105c() { run_test 105c "lockf when mounted without -o flock test ========" test_106() { #bug 10921 - mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" $DIR/$tdir && error "exec $DIR/$tdir succeeded" chmod 777 $DIR/$tdir || error "chmod $DIR/$tdir failed" } @@ -3888,8 +3983,9 @@ test_115() { run_test 115 "verify dynamic thread creation====================" free_min_max () { - AVAIL=($(cat $LPROC/osc/*[oO][sS][cC]-[^M]*/kbytesavail)) - echo OST kbytes available: ${AVAIL[@]} + wait_delete_completed + AVAIL=($(cat $LPROC/osc/*[oO][sS][cC]-[^M]*/kbytesavail)) + echo OST kbytes available: ${AVAIL[@]} MAXI=0; MAXV=${AVAIL[0]} MINI=0; MINV=${AVAIL[0]} for ((i = 0; i < ${#AVAIL[@]}; i++)); do @@ -3907,8 +4003,7 @@ free_min_max () { test_116() { [ "$OSTCOUNT" -lt "2" ] && skip "$OSTCOUNT < 2 OSTs" && return - [ $(grep -c obdfilter $LPROC/devices) -eq 0 ] && - skip "remote MDS, skipping test" && return + remote_mds && skip "remote MDS" && return echo -n "Free space priority " cat $LPROC/lov/*-clilov-*/qos_prio_free @@ -4000,15 +4095,326 @@ test_117() # bug 10891 } run_test 117 "verify fsfilt_extend ==========" -test_118() #bug 11710 +# Reset async IO behavior after error case +reset_async() { + FILE=$DIR/reset_async + + # Ensure all OSCs are cleared + $LSTRIPE $FILE 0 -1 -1 + dd if=/dev/zero of=$FILE bs=64k count=$OSTCOUNT + sync + rm $FILE +} + +test_118a() #bug 11710 { - sync; sleep 1; sync - multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c; - dirty=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + reset_async + + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + return 1; + fi +} +run_test 118a "verify O_SYNC works ==========" + +test_118b() +{ + reset_async + + #define OBD_FAIL_OST_ENOENT 0x217 + do_facet ost sysctl -w lustre.fail_loc=0x217 + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + do_facet ost sysctl -w lustre.fail_loc=0 + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + return 1; + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + return 1; + fi + + echo "Dirty pages not leaked on ENOENT" + + # Due to the above error the OSC will issue all RPCs syncronously + # until a subsequent RPC completes successfully without error. + multiop $DIR/$tfile Ow4096yc + rm -f $DIR/$tfile + + return 0 +} +run_test 118b "Reclaim dirty pages on fatal error ==========" + +test_118c() +{ + reset_async + + #define OBD_FAIL_OST_EROFS 0x216 + do_facet ost sysctl -w lustre.fail_loc=0x216 + + # multiop should block due to fsync until pages are written + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + MULTIPID=$! + sleep 1 + + if [[ `ps h -o comm -p $MULTIPID` != "multiop" ]]; then + error "Multiop failed to block on fsync, pid=$MULTIPID" + fi + + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $WRITEBACK -eq 0 ]]; then + error "No page in writeback, writeback=$WRITEBACK" + fi + + do_facet ost sysctl -w lustre.fail_loc=0 + wait $MULTIPID + RC=$? + if [[ $RC -ne 0 ]]; then + error "Multiop fsync failed, rc=$RC" + fi + + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "Dirty pages flushed via fsync on EROFS" + return 0 +} +run_test 118c "Fsync blocks on EROFS until dirty pages are flushed ==========" + +test_118d() +{ + reset_async + + #define OBD_FAIL_OST_BRW_PAUSE_BULK + do_facet ost sysctl -w lustre.fail_loc=0x214 + # multiop should block due to fsync until pages are written + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + MULTIPID=$! + sleep 1 + + if [[ `ps h -o comm -p $MULTIPID` != "multiop" ]]; then + error "Multiop failed to block on fsync, pid=$MULTIPID" + fi + + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $WRITEBACK -eq 0 ]]; then + error "No page in writeback, writeback=$WRITEBACK" + fi + + wait $MULTIPID || error "Multiop fsync failed, rc=$?" + do_facet ost sysctl -w lustre.fail_loc=0 + + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "Dirty pages gaurenteed flushed via fsync" + return 0 +} +run_test 118d "Fsync validation inject a delay of the bulk ==========" + +test_118f() { + reset_async + + #define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a + sysctl -w lustre.fail_loc=0x8000040a + + # Should simulate EINVAL error which is fatal + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + sysctl -w lustre.fail_loc=0x0 + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + reset_async + return 0 +} +run_test 118f "Simulate unrecoverable OSC side error ==========" + +test_118g() { + reset_async + + #define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 + sysctl -w lustre.fail_loc=0x406 + + # simulate local -ENOMEM + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + + sysctl -w lustre.fail_loc=0 + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + reset_async + return 0 +} +run_test 118g "Don't stay in wait if we got local -ENOMEM ==========" + +test_118h() { + reset_async + + #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e + do_facet ost sysctl -w lustre.fail_loc=0x20e + # Should simulate ENOMEM error which is recoverable and should be handled by timeout + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + + do_facet ost sysctl -w lustre.fail_loc=0 + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + return 0 +} +run_test 118h "Verify timeout in handling recoverables errors ==========" + +test_118i() { + reset_async + + #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e + do_facet ost sysctl -w lustre.fail_loc=0x20e + + # Should simulate ENOMEM error which is recoverable and should be handled by timeout + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + PID=$! + sleep 5 + do_facet ost sysctl -w lustre.fail_loc=0 + + wait $PID + RC=$? + if [[ $RC -ne 0 ]]; then + error "got error, but should be not, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + return 0 +} +run_test 118i "Fix error before timeout in recoverable error ==========" + +test_118j() { + reset_async + + #define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 + do_facet ost sysctl -w lustre.fail_loc=0x220 + + # return -EIO from OST + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + do_facet ost sysctl -w lustre.fail_loc=0x0 + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi - return $dirty + # in recoverable error on OST we want resend and stay until it finished + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + return 0 +} +run_test 118j "Simulate unrecoverable OST side error ==========" + +test_118k() +{ + #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e + do_facet ost sysctl -w lustre.fail_loc=0x20e + mkdir -p $DIR/$tdir + + for ((i=0;i<10;i++)); do + dd if=/dev/zero of=$DIR/$tdir/$tdir-$i bs=1M count=10 & + SLEEPPID=$! + sleep 0.500s + kill $SLEEPPID + wait $SLEEPPID + done + + sysctl -w lustre.fail_loc=0 } -run_test 118 "verify O_SYNC work" +run_test 118k "bio alloc -ENOMEM and IO TERM handling =========" test_119a() # bug 11737 { @@ -4038,8 +4444,79 @@ test_119b() # bug 11737 } run_test 119b "Sparse directIO read must return actual read amount" -test_119a() { - mkdir $DIR/$tdir +LDLM_POOL_CTL_RECALC=1 +LDLM_POOL_CTL_SHRINK=2 + +disable_pool_recalc() { + for NSD in $LPROC/ldlm/namespaces/*$1*; do + if test -f $NSD/pool/control; then + CONTROL=`cat $NSD/pool/control` + CONTROL=$((CONTROL & ~LDLM_POOL_CTL_RECALC)) + echo "$CONTROL" > $NSD/pool/control + fi + done +} + +enable_pool_recalc() { + for NSD in $LPROC/ldlm/namespaces/*$1*; do + if test -f $NSD/pool/control; then + CONTROL=`cat $NSD/pool/control` + CONTROL=$((CONTROL | LDLM_POOL_CTL_RECALC)) + echo "$CONTROL" > $NSD/pool/control + fi + done +} + +disable_pool_shrink() { + for NSD in $LPROC/ldlm/namespaces/*$1*; do + if test -f $NSD/pool/control; then + CONTROL=`cat $NSD/pool/control` + CONTROL=$((CONTROL & ~LDLM_POOL_CTL_SHRINK)) + echo "$CONTROL" > $NSD/pool/control + fi + done +} + +enable_pool_shrink() { + for NSD in $LPROC/ldlm/namespaces/*$1*; do + if test -f $NSD/pool/control; then + CONTROL=`cat $NSD/pool/control` + CONTROL=$((CONTROL | LDLM_POOL_CTL_SHRINK)) + echo "$CONTROL" > $NSD/pool/control + fi + done +} + +disable_pool() { + disable_pool_shrink $1 + disable_pool_recalc $1 +} + +enable_pool() { + enable_pool_shrink $1 + enable_pool_recalc $1 +} + +lru_resize_enable() +{ + enable_pool osc + enable_pool "filter-$FSNAME" + enable_pool mdc + enable_pool "mds-$FSNAME" +} + +lru_resize_disable() +{ + disable_pool osc + disable_pool "filter-$FSNAME" + disable_pool mdc + disable_pool "mds-$FSNAME" +} + +test_120a() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable cancel_lru_locks mdc stat $DIR/$tdir > /dev/null can1=`awk '/ldlm_cancel/ {print $2}' $LPROC/ldlm/services/ldlm_canceld/stats` @@ -4049,11 +4526,14 @@ test_119a() { blk2=`awk '/ldlm_bl_callback/ {print $2}' $LPROC/ldlm/services/ldlm_cbd/stats` [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured." [ $blk1 -eq $blk2 ] || error $((blk2-blk1)) "blocking RPC occured." + lru_resize_enable } -run_test 119a "Early Lock Cancel: mkdir test" +run_test 120a "Early Lock Cancel: mkdir test" -test_119b() { - mkdir $DIR/$tdir +test_120b() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable cancel_lru_locks mdc stat $DIR/$tdir > /dev/null can1=`awk '/ldlm_cancel/ {print $2}' $LPROC/ldlm/services/ldlm_canceld/stats` @@ -4063,10 +4543,14 @@ test_119b() { can2=`awk '/ldlm_cancel/ {print $2}' $LPROC/ldlm/services/ldlm_canceld/stats` [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured." [ $blk1 -eq $blk2 ] || error $((blk2-blk1)) "blocking RPC occured." + lru_resize_enable } -run_test 119b "Early Lock Cancel: create test" +run_test 120b "Early Lock Cancel: create test" -test_119c() { +test_120c() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable mkdir -p $DIR/$tdir/d1 $DIR/$tdir/d2 touch $DIR/$tdir/d1/f1 cancel_lru_locks mdc @@ -4078,10 +4562,14 @@ test_119c() { blk2=`awk '/ldlm_bl_callback/ {print $2}' $LPROC/ldlm/services/ldlm_cbd/stats` [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured." [ $blk1 -eq $blk2 ] || error $((blk2-blk1)) "blocking RPC occured." + lru_resize_enable } -run_test 119c "Early Lock Cancel: link test" +run_test 120c "Early Lock Cancel: link test" -test_119d() { +test_120d() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable touch $DIR/$tdir cancel_lru_locks mdc stat $DIR/$tdir > /dev/null @@ -4092,11 +4580,14 @@ test_119d() { blk2=`awk '/ldlm_bl_callback/ {print $2}' $LPROC/ldlm/services/ldlm_cbd/stats` [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured." [ $blk1 -eq $blk2 ] || error $((blk2-blk1)) "blocking RPC occured." + lru_resize_enable } -run_test 119d "Early Lock Cancel: setattr test" +run_test 120d "Early Lock Cancel: setattr test" -test_119e() { - mkdir $DIR/$tdir +test_120e() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable dd if=/dev/zero of=$DIR/$tdir/f1 count=1 cancel_lru_locks mdc cancel_lru_locks osc @@ -4109,10 +4600,14 @@ test_119e() { blk2=`awk '/ldlm_bl_callback/ {print $2}' $LPROC/ldlm/services/ldlm_cbd/stats` [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured." [ $blk1 -eq $blk2 ] || error $((blk2-blk1)) "blocking RPC occured." + lru_resize_enable } -run_test 119e "Early Lock Cancel: unlink test" +run_test 120e "Early Lock Cancel: unlink test" -test_119f() { +test_120f() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable mkdir -p $DIR/$tdir/d1 $DIR/$tdir/d2 dd if=/dev/zero of=$DIR/$tdir/d1/f1 count=1 dd if=/dev/zero of=$DIR/$tdir/d2/f2 count=1 @@ -4128,13 +4623,17 @@ test_119f() { blk2=`awk '/ldlm_bl_callback/ {print $2}' $LPROC/ldlm/services/ldlm_cbd/stats` [ $can1 -eq $can2 ] || error $((can2-can1)) "cancel RPC occured." [ $blk1 -eq $blk2 ] || error $((blk2-blk1)) "blocking RPC occured." + lru_resize_enable } -run_test 119f "Early Lock Cancel: rename test" +run_test 120f "Early Lock Cancel: rename test" -test_119g() { +test_120g() { + [ -z "`grep early_lock_cancel $LPROC/mdc/*/connect_flags`" ] && \ + skip "no early lock cancel on server" && return 0 + lru_resize_disable count=10000 echo create $count files - mkdir $DIR/$tdir + mkdir -p $DIR/$tdir cancel_lru_locks mdc cancel_lru_locks osc t0=`date +%s` @@ -4157,21 +4656,227 @@ test_119g() { echo total: $((can2-can1)) cancels, $((blk2-blk1)) blockings sleep 2 # wait for commitment of removal + lru_resize_enable +} +run_test 120g "Early Lock Cancel: performance test" + +test_121() { #bug #10589 + rm -rf $DIR/$tfile + writes=`dd if=/dev/zero of=$DIR/$tfile count=1 2>&1 | awk 'BEGIN { FS="+" } /out/ {print $1}'` +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 + sysctl -w lustre.fail_loc=0x310 + cancel_lru_locks osc > /dev/null + reads=`dd if=$DIR/$tfile of=/dev/null 2>&1 | awk 'BEGIN { FS="+" } /in/ {print $1}'` + sysctl -w lustre.fail_loc=0 + [ "$reads" -eq "$writes" ] || error "read" $reads "blocks, must be" $writes +} +run_test 121 "read cancel race =========" + +cmd_cancel_lru_locks() { + NS=$1 + test "x$NS" = "x" && NS="mdc" + for d in `find $LPROC/ldlm/namespaces | grep $NS`; do + if test -f $d/lru_size; then + cancel_lru_locks $d + fi + done +} + +test_124a() { + [ -z "`grep lru_resize $LPROC/mdc/*/connect_flags`" ] && \ + skip "no lru resize on server" && return 0 + cmd_cancel_lru_locks "mdc" + lru_resize_enable + + # we want to test main pool functionality, that is cancel based on SLV + # this is why shrinkers are disabled + disable_pool_shrink "mds-$FSNAME" + disable_pool_shrink mdc + + NR=2000 + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + + # use touch to produce $NR new locks + log "create $NR files at $DIR/$tdir" + for ((i=0;i<$NR;i++)); do touch $DIR/$tdir/f$i; done + + NSDIR="" + LRU_SIZE=0 + for d in `find $LPROC/ldlm/namespaces | grep mdc-`; do + if test -f $d/lru_size; then + LRU_SIZE=`cat $d/lru_size` + if test $LRU_SIZE -gt 0; then + log "using $d namespace" + NSDIR=$d + break + fi + fi + done + + if test -z $NSDIR; then + skip "No cached locks created!" + return 0 + fi + + if test $LRU_SIZE -lt 100; then + skip "Not enough cached locks created!" + return 0 + fi + log "created $LRU_SIZE lock(s)" + + # we want to sleep 30s to not make test too long + SLEEP=30 + SLEEP_ADD=2 + + # we know that lru resize allows one client to hold $LIMIT locks for 10h + MAX_HRS=10 + + # get the pool limit + LIMIT=`cat $NSDIR/pool/limit` + + # calculate lock volume factor taking into account data set size and the + # rule that number of locks will be getting smaller durring sleep interval + # and we need to additionally enforce LVF to take this into account. + # Use $LRU_SIZE_B here to take into account real number of locks created + # in the case of CMD, LRU_SIZE_B != $NR in most of cases + LVF=$(($MAX_HRS * 60 * 60 * $LIMIT / $SLEEP)) + LRU_SIZE_B=$LRU_SIZE + log "make client drop locks $LVF times faster so that ${SLEEP}s is enough to cancel $LRU_SIZE_B lock(s)" + OLD_LVF=`cat $NSDIR/pool/lock_volume_factor` + echo "$LVF" > $NSDIR/pool/lock_volume_factor + log "sleep for $((SLEEP+SLEEP_ADD))s" + sleep $((SLEEP+SLEEP_ADD)) + echo "$OLD_LVF" > $NSDIR/pool/lock_volume_factor + LRU_SIZE_A=`cat $NSDIR/lru_size` + + [ $LRU_SIZE_B -gt $LRU_SIZE_A ] || { + error "No locks dropped in "$((SLEEP+SLEEP_ADD))"s. LRU size: $LRU_SIZE_A" + lru_resize_enable + unlinkmany $DIR/$tdir/f $NR + return + } + + log "Dropped "$((LRU_SIZE_B-LRU_SIZE_A))" locks in "$((SLEEP+SLEEP_ADD))"s" + lru_resize_enable + log "unlink $NR files at $DIR/$tdir" + unlinkmany $DIR/$tdir/f $NR +} +run_test 124a "lru resize =======================================" + +set_lru_size() { + NS=$1 + SIZE=$2 + test "x$NS" = "x" && NS="mdc" + test "x$SIZE" = "x" && SIZE="0" + test $SIZE -lt 0 && SIZE="0" + test $SIZE -gt 0 && ACTION="disabled" || ACTION="enabled" + for d in `find $LPROC/ldlm/namespaces | grep $NS`; do + if test -f $d/lru_size; then + log "$(basename $d):" + log " lru resize $ACTION" + log " lru_size=$SIZE" + echo $SIZE > $d/lru_size + fi + done +} + +get_lru_size() { + NS=$1 + test "x$NS" = "x" && NS="mdc" + for d in `find $LPROC/ldlm/namespaces | grep $NS`; do + if test -f $d/lru_size; then + log "$(basename $d):" + log " lru_size=$(cat $d/lru_size)" + fi + done +} + +test_124b() { + [ -z "`grep lru_resize $LPROC/mdc/*/connect_flags`" ] && \ + skip "no lru resize on server" && return 0 + + NSDIR=`find $LPROC/ldlm/namespaces | grep mdc | head -1` + LIMIT=`cat $NSDIR/pool/limit` + + NR_CPU=$(awk '/processor/' /proc/cpuinfo | wc -l) + # 100 locks here is default value for non-shrinkable lru as well + # as the order to switch to static lru managing policy + # define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) + LDLM_DEFAULT_LRU_SIZE=$((100 * NR_CPU)) + + NR=$((LIMIT-(LIMIT/3))) + log "starting lru resize disable cycle" + set_lru_size "mdc-" $LDLM_DEFAULT_LRU_SIZE + + mkdir -p $DIR/$tdir/disable_lru_resize || + error "failed to create $DIR/$tdir/disable_lru_resize" + + createmany -o $DIR/$tdir/disable_lru_resize/f $NR + log "doing ls -la $DIR/$tdir/disable_lru_resize 3 times" + stime=`date +%s` + ls -la $DIR/$tdir/disable_lru_resize > /dev/null + ls -la $DIR/$tdir/disable_lru_resize > /dev/null + ls -la $DIR/$tdir/disable_lru_resize > /dev/null + etime=`date +%s` + nolruresize_delta=$((etime-stime)) + log "ls -la time: $nolruresize_delta seconds" + get_lru_size "mdc-" + + log "starting lru resize enable cycle" + mkdir -p $DIR/$tdir/enable_lru_resize || + error "failed to create $DIR/$tdir/enable_lru_resize" + + # 0 locks means here flush lru and switch to lru resize policy + set_lru_size "mdc-" 0 + + createmany -o $DIR/$tdir/enable_lru_resize/f $NR + log "doing ls -la $DIR/$tdir/enable_lru_resize 3 times" + stime=`date +%s` + ls -la $DIR/$tdir/enable_lru_resize > /dev/null + ls -la $DIR/$tdir/enable_lru_resize > /dev/null + ls -la $DIR/$tdir/enable_lru_resize > /dev/null + etime=`date +%s` + lruresize_delta=$((etime-stime)) + log "ls -la time: $lruresize_delta seconds" + get_lru_size "mdc-" + + if test $lruresize_delta -gt $nolruresize_delta; then + log "ls -la is $((lruresize_delta - $nolruresize_delta))s slower with lru resize enabled" + elif test $nolruresize_delta -gt $lruresize_delta; then + log "ls -la is $((nolruresize_delta - $lruresize_delta))s faster with lru resize enabled" + else + log "lru resize performs the same with no lru resize" + fi +} +run_test 124b "lru resize (performance test) =======================" + +test_125() { # 13358 + [ -z "$(grep acl $LPROC/mdc/*-mdc-*/connect_flags)" ] && skip "must have acl enabled" && return + mkdir -p $DIR/d125 || error "mkdir failed" + $SETSTRIPE $DIR/d125 65536 -1 -1 || error "setstripe failed" + setfacl -R -m u:bin:rwx $DIR/d125 || error "setfacl $DIR/d125 failed" + ls -ld $DIR/d125 || error "cannot access $DIR/d125" } -run_test 119g "Early Lock Cancel: performance test" +run_test 125 "don't return EPROTO when a dir has a non-default striping and ACLs" + +test_126() { # bug 12829/13455 + [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return + $RUNAS -u 0 -g 1 touch $DIR/$tfile || error "touch failed" + gid=`ls -n $DIR/$tfile | awk '{print $4}'` + rm -f $DIR/$tfile + [ $gid -eq "1" ] || error "gid is set to" $gid "instead of 1" +} +run_test 126 "check that the fsgid provided by the client is taken into account" TMPDIR=$OLDTMPDIR TMP=$OLDTMP HOME=$OLDHOME log "cleanup: ======================================================" -if [ "`mount | grep $MOUNT`" ]; then - rm -rf $DIR/[Rdfs][1-9]* +check_and_cleanup_lustre +if [ "$I_MOUNTED" != "yes" ]; then + sysctl -w lnet.debug="$OLDDEBUG" 2> /dev/null || true fi -if [ "$I_MOUNTED" = "yes" ]; then - cleanupall -f || error "cleanup failed" -fi - echo '=========================== finished ===============================' [ -f "$SANITYLOG" ] && cat $SANITYLOG && grep -q FAIL $SANITYLOG && exit 1 || true