X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Fsanity.sh;h=cff47e9e9eb01005e6ce44e1b41705974da5a604;hb=72b59b85a253e508ec1b192fbf8cad840ca6ff2c;hp=ec3d67e254c1d17815467f5af2b133c7ddad053f;hpb=c00be06a1f8f27eb5bd8bb47086d0f1e5b5f5f50;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index ec3d67e..cff47e9 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -53,16 +53,16 @@ fi if [[ $(uname -m) = aarch64 ]]; then # bug number: LU-11596 ALWAYS_EXCEPT+=" $GRANT_CHECK_LIST" - # bug number: LU-11671 LU-11594 LU-11667 LU-11729 LU-4398 - ALWAYS_EXCEPT+=" 45 103a 317 810 817" + # bug number: LU-11671 LU-11667 LU-11729 LU-4398 + ALWAYS_EXCEPT+=" 45 317 810 817" fi # 5 12 (min)" [ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 300o" if [ "$mds1_FSTYPE" = "zfs" ]; then - # bug number for skipped test: LU-1957 - ALWAYS_EXCEPT="$ALWAYS_EXCEPT 180" + # bug number for skipped test: + ALWAYS_EXCEPT="$ALWAYS_EXCEPT " # 13 (min)" [ "$SLOW" = "no" ] && EXCEPT_SLOW="$EXCEPT_SLOW 51b" fi @@ -3082,10 +3082,10 @@ test_31n() { run_test 31n "check link count of unlinked file" link_one() { - local TEMPNAME=$(mktemp $1_XXXXXX) - mlink $TEMPNAME $1 2> /dev/null && - echo "$BASHPID: link $TEMPNAME to $1 succeeded" - munlink $TEMPNAME + local tempfile=$(mktemp $1_XXXXXX) + mlink $tempfile $1 2> /dev/null && + echo "$BASHPID: link $tempfile to $1 succeeded" + munlink $tempfile } test_31o() { # LU-2901 @@ -6807,9 +6807,9 @@ test_60a() { local pass=true #get fid and record list - fid_list=($(awk '/9_sub.*record/ { print $NF }' /$TMP/$tfile | + fid_list=($(awk '/9_sub.*record/ { print $NF }' $TMP/$tfile | tail -n 4)) - rec_list=($(awk '/9_sub.*record/ { print $((NF-3)) }' /$TMP/$tfile | + rec_list=($(awk '/9_sub.*record/ { print $((NF-3)) }' $TMP/$tfile | tail -n 4)) #remount mgs as ldiskfs or zfs type stop mgs || error "stop mgs failed" @@ -6928,13 +6928,16 @@ run_test 60e "no space while new llog is being created" 
test_60g() { local pid + local i test_mkdir -c $MDSCOUNT $DIR/$tdir - $LFS setdirstripe -D -i -1 -c $MDSCOUNT $DIR/$tdir ( local index=0 while true; do + $LFS setdirstripe -i $(($index % $MDSCOUNT)) \ + -c $MDSCOUNT $DIR/$tdir/subdir$index \ + 2>/dev/null mkdir $DIR/$tdir/subdir$index 2>/dev/null rmdir $DIR/$tdir/subdir$index 2>/dev/null index=$((index + 1)) @@ -6943,16 +6946,34 @@ test_60g() { pid=$! - for i in $(seq 100); do + for i in {0..100}; do # define OBD_FAIL_OSD_TXN_START 0x19a - do_facet mds1 lctl set_param fail_loc=0x8000019a + local index=$((i % MDSCOUNT + 1)) + + do_facet mds$index $LCTL set_param fail_loc=0x8000019a \ + > /dev/null usleep 100 done kill -9 $pid + for i in $(seq $MDSCOUNT); do + do_facet mds$i $LCTL set_param fail_loc=0 > /dev/null + done + mkdir $DIR/$tdir/new || error "mkdir failed" rmdir $DIR/$tdir/new || error "rmdir failed" + + do_facet mds1 $LCTL lfsck_start -M $(facet_svc mds1) -A -C \ + -t namespace + for i in $(seq $MDSCOUNT); do + wait_update_facet mds$i "$LCTL get_param -n \ + mdd.$(facet_svc mds$i).lfsck_namespace | + awk '/^status/ { print \\\$2 }'" "completed" + done + + ls -R $DIR/$tdir || error "ls failed" + rm -rf $DIR/$tdir || error "rmdir failed" } run_test 60g "transaction abort won't cause MDT hung" @@ -7747,9 +7768,22 @@ CKSUM_TYPES=${CKSUM_TYPES:-$(lctl get_param -n osc.*osc-[^mM]*.checksum_type | set_checksum_type() { lctl set_param -n osc.*osc-[^mM]*.checksum_type $1 - log "set checksum type to $1" - return 0 + rc=$? + log "set checksum type to $1, rc = $rc" + return $rc +} + +get_osc_checksum_type() +{ + # argument 1: OST name, like OST0000 + ost=$1 + checksum_type=$(lctl get_param -n osc.*${ost}-osc-[^mM]*.checksum_type | + sed 's/.*\[\(.*\)\].*/\1/g') + rc=$? 
+ [ $rc -ne 0 ] && error "failed to get checksum type of $ost, rc = $rc, output = $checksum_type" + echo $checksum_type } + F77_TMP=$TMP/f77-temp F77SZ=8 setup_f77() { @@ -8001,6 +8035,38 @@ test_77k() { # LU-10906 } run_test 77k "enable/disable checksum correctly" +test_77l() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" + $GSS && skip_env "could not run with gss" + + set_checksums 1 + stack_trap "set_checksums $ORIG_CSUM" EXIT + stack_trap "set_checksum_type $ORIG_CSUM_TYPE" EXIT + + set_checksum_type invalid && error "unexpected success of invalid checksum type" + + $LFS setstripe -c 1 -i 0 $DIR/$tfile + for algo in $CKSUM_TYPES; do + set_checksum_type $algo || error "fail to set checksum type $algo" + osc_algo=$(get_osc_checksum_type OST0000) + [ "$osc_algo" != "$algo" ] && error "checksum type is $osc_algo after setting it to $algo" + + # no locks, no reqs to let the connection idle + cancel_lru_locks osc + lru_resize_disable osc + wait_osc_import_state client ost1 IDLE + + # ensure ost1 is connected + stat $DIR/$tfile >/dev/null || error "can't stat" + wait_osc_import_state client ost1 FULL + + osc_algo=$(get_osc_checksum_type OST0000) + [ "$osc_algo" != "$algo" ] && error "checksum type changed from $algo to $osc_algo after reconnection" + done + return 0 +} +run_test 77l "preferred checksum type is remembered after reconnected" + [ "$ORIG_CSUM" ] && set_checksums $ORIG_CSUM || true rm -f $F77_TMP unset F77_TMP @@ -8399,12 +8465,14 @@ test_101c() { cancel_lru_locks osc $LCTL set_param osc.*.rpc_stats 0 $READS -f $DIR/$tfile -s$FILE_LENGTH -b$rsize -n$nreads -t 180 + $LCTL get_param osc.*.rpc_stats for osc_rpc_stats in $($LCTL get_param -N osc.*.rpc_stats); do local stats=$($LCTL get_param -n $osc_rpc_stats) local lines=$(echo "$stats" | awk 'END {print NR;}') local size if [ $lines -le 20 ]; then + echo "continue debug" continue fi for size in 1 2 4 8; do @@ -11019,6 +11087,75 @@ test_127b() { # bug LU-333 } run_test 127b "verify the llite client 
stats are sane" +test_127c() { # LU-12394 + [ "$OSTCOUNT" -lt "2" ] && skip_env "needs >= 2 OSTs" + local size + local bsize + local reads + local writes + local count + + $LCTL set_param llite.*.extents_stats=1 + stack_trap "$LCTL set_param llite.*.extents_stats=0" EXIT + + # Use two stripes so there is enough space in default config + $LFS setstripe -c 2 $DIR/$tfile + + # Extent stats start at 0-4K and go in power of two buckets + # LL_HIST_START = 12 --> 2^12 = 4K + # We do 3K*2^i, so 3K, 6K, 12K, 24K... hitting each bucket. + # We do not do buckets larger than 64 MiB to avoid ENOSPC issues on + # small configs + for size in 3K 6K 12K 24K 48K 96K 192K 384K 768K 1536K 3M 6M 12M 24M 48M; + do + # Write and read, 2x each, second time at a non-zero offset + dd if=/dev/zero of=$DIR/$tfile bs=$size count=1 + dd if=/dev/zero of=$DIR/$tfile bs=$size count=1 seek=10 + dd if=$DIR/$tfile of=/dev/null bs=$size count=1 + dd if=$DIR/$tfile of=/dev/null bs=$size count=1 seek=10 + rm -f $DIR/$tfile + done + + $LCTL get_param llite.*.extents_stats + + count=2 + for bsize in 4K 8K 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M 16M 32M 64M; + do + local bucket=$($LCTL get_param -n llite.*.extents_stats | + grep -m 1 $bsize) + reads=$(echo $bucket | awk '{print $5}') + writes=$(echo $bucket | awk '{print $9}') + [ "$reads" -eq $count ] || + error "$reads reads in < $bsize bucket, expect $count" + [ "$writes" -eq $count ] || + error "$writes writes in < $bsize bucket, expect $count" + done + + # Test mmap write and read + $LCTL set_param llite.*.extents_stats=c + size=512 + dd if=/dev/zero of=$DIR/$tfile bs=${size}K count=1 + $MULTIOP $DIR/$tfile OSMRUc || error "$MULTIOP $DIR/$tfile failed" + $MULTIOP $DIR/$tfile OSMWUc || error "$MULTIOP $DIR/$tfile failed" + + $LCTL get_param llite.*.extents_stats + + count=$(((size*1024) / PAGE_SIZE)) + + bsize=$((2 * PAGE_SIZE / 1024))K + + bucket=$($LCTL get_param -n llite.*.extents_stats | + grep -m 1 $bsize) + reads=$(echo $bucket | awk '{print 
$5}') + writes=$(echo $bucket | awk '{print $9}') + # mmap writes fault in the page first, creating an additional read + [ "$reads" -eq $((2 * count)) ] || + error "$reads reads in < $bsize bucket, expect $count" + [ "$writes" -eq $count ] || + error "$writes writes in < $bsize bucket, expect $count" +} +run_test 127c "test llite extent stats with regular & mmap i/o" + test_128() { # bug 15212 touch $DIR/$tfile $LFS 2>&1 <<-EOF | tee $TMP/$tfile.log @@ -13634,6 +13771,49 @@ test_160i() { } run_test 160i "changelog user register/unregister race" +test_160j() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ $MDS1_VERSION -lt $(version_code 2.12.56) ]] && + skip "Need MDS version at least 2.12.56" + + mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed" + + changelog_register || error "first changelog_register failed" + + # generate some changelog + test_mkdir -c $MDSCOUNT $DIR/$tdir || error "mkdir $tdir failed" + createmany -m $DIR/$tdir/${tfile}bis $((MDSCOUNT * 2)) || + error "create $DIR/$tdir/${tfile}bis failed" + + # open the changelog device + exec 3>/dev/changelog-$FSNAME-MDT0000 + exec 4</dev/changelog-$FSNAME-MDT0000 + + # umount the first lustre mount + umount $MOUNT + + # read changelog + cat <&4 >/dev/null || error "read changelog failed" + + # clear changelog + local cl_user="${CL_USERS[$SINGLEMDS]%% *}" + changelog_users $SINGLEMDS | grep -q $cl_user || + error "User $cl_user not found in changelog_users" + + printf 'clear:'$cl_user':0' >&3 + + # close + exec 3>&- + exec 4<&- + + # cleanup + changelog_deregister || error "changelog_deregister failed" + + umount $MOUNT2 + mount_client $MOUNT || error "mount_client on $MOUNT failed" +} +run_test 160j "client can be umounted while its chanangelog is being used" + test_161a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" @@ -13830,7 +14010,8 @@ test_161d() { ps -p $pid [[ $? 
-eq 0 ]] || error "create should be blocked" - local tempfile=$(mktemp) + local tempfile="$(mktemp --tmpdir $tfile.XXXXXX)" + stack_trap "rm -f $tempfile" fid=$(changelog_extract_field "CREAT" "$tfile" "t=") cat $MOUNT/.lustre/fid/$fid 2>/dev/null >$tempfile || error "cat failed" # some delay may occur during ChangeLog publishing and file read just @@ -16703,7 +16884,7 @@ test_243() } run_test 243 "various group lock tests" -test_244() +test_244a() { test_mkdir $DIR/$tdir dd if=/dev/zero of=$DIR/$tdir/$tfile bs=1M count=35 @@ -16711,7 +16892,26 @@ test_244() error "sendfile+grouplock failed" rm -rf $DIR/$tdir } -run_test 244 "sendfile with group lock tests" +run_test 244a "sendfile with group lock tests" + +test_244b() +{ + [ $PARALLEL == "yes" ] && skip "skip parallel run" && return + + local threads=50 + local size=$((1024*1024)) + + test_mkdir $DIR/$tdir + for i in $(seq 1 $threads); do + local file=$DIR/$tdir/file_$((i / 10)) + $MULTIOP $file OG1234w$size_$((i % 3))w$size_$((i % 4))g1234c & + local pids[$i]=$! 
+ done + for i in $(seq 1 $threads); do + wait ${pids[$i]} + done +} +run_test 244b "multi-threaded write with group lock" test_245() { local flagname="multi_mod_rpcs" @@ -17483,7 +17683,8 @@ test_256() { #after mount new plainllog is used touch $DIR/$tdir/{11..19} - local tmpfile=$(mktemp -u $tfile.XXXXXX) + local tmpfile="$(mktemp --tmpdir -u $tfile.XXXXXX)" + stack_trap "rm -f $tmpfile" cat_sl=$(do_facet $SINGLEMDS "sync; \ $DEBUGFS -c -R 'dump changelog_catalog $tmpfile' $mdt_dev; \ llog_reader $tmpfile | grep -c type=1064553b") @@ -17495,7 +17696,7 @@ test_256() { cat_sl=$(do_facet $SINGLEMDS "sync; \ $DEBUGFS -c -R 'dump changelog_catalog $tmpfile' $mdt_dev; \ - llog_reader $tmpfile | grep -c type=1064553b; rm -f $tmpfile") + llog_reader $tmpfile | grep -c type=1064553b") if (( cat_sl == 2 )); then error "Empty plain llog was not deleted from changelog catalog" @@ -18119,8 +18320,8 @@ test_271f() { local mdtidx=$($LFS getstripe --mdt-index $DIR/$tdir) cancel_lru_locks mdc - dd if=/dev/urandom of=$tmp bs=200000 count=1 - dd if=$tmp of=$dom bs=200000 count=1 + dd if=/dev/urandom of=$tmp bs=265000 count=1 + dd if=$tmp of=$dom bs=265000 count=1 cancel_lru_locks mdc cat /etc/hosts >> $tmp lctl set_param -n mdc.*.stats=clear @@ -18147,6 +18348,7 @@ test_271f() { local ra=$(get_mdc_stats $mdtidx req_active) local rw=$(get_mdc_stats $mdtidx req_waittime) + [ -z $num ] && num=0 [ $num -eq 1 ] || error "expect 1 READ RPC, $num occured" [ $ra == $rw ] || error "$((ra - rw)) resend occured" echo "... 
DONE" @@ -18226,12 +18428,12 @@ test_272b() { $LFS migrate -c2 $dom || error "failed to migrate to the new composite layout" - [ $($LFS getstripe -L $dom) == 'mdt' ] && + [ $($LFS getstripe -L $dom) != 'mdt' ] || error "MDT stripe was not removed" cancel_lru_locks mdc local new_md5=$(md5sum $dom) - [ "$old_md5" != "$new_md5" ] && + [ "$old_md5" == "$new_md5" ] || error "$old_md5 != $new_md5" # Skip free space checks with ZFS @@ -18271,7 +18473,7 @@ test_272c() { cancel_lru_locks mdc local new_md5=$(md5sum $dom) - [ "$old_md5" != "$new_md5" ] && + [ "$old_md5" == "$new_md5" ] || error "$old_md5 != $new_md5" # Skip free space checks with ZFS @@ -18285,6 +18487,108 @@ test_272c() { } run_test 272c "DoM migration: DOM file to the OST-striped file (composite)" +test_272d() { + [ $MDS1_VERSION -lt $(version_code 2.12.55) ] && + skip "Need MDS version at least 2.12.55" + + local dom=$DIR/$tdir/$tfile + mkdir -p $DIR/$tdir + $LFS setstripe -E 1M -L mdt -E -1 -c1 $dom + + local mdtidx=$($LFS getstripe -m $dom) + local mdtname=MDT$(printf %04x $mdtidx) + local facet=mds$((mdtidx + 1)) + + dd if=/dev/urandom of=$dom bs=2M count=1 oflag=direct || + error "failed to write data into $dom" + local old_md5=$(md5sum $dom) + cancel_lru_locks mdc + local mdtfree1=$(do_facet $facet \ + lctl get_param -n osd*.*$mdtname.kbytesfree) + + $LFS mirror extend -N -E 2M -c1 -E -1 -c2 $dom || + error "failed mirroring to the new composite layout" + $LFS mirror resync $dom || + error "failed mirror resync" + $LFS mirror split --mirror-id 1 -d $dom || + error "failed mirror split" + + [ $($LFS getstripe -L $dom) != 'mdt' ] || + error "MDT stripe was not removed" + + cancel_lru_locks mdc + local new_md5=$(md5sum $dom) + [ "$old_md5" == "$new_md5" ] || + error "$old_md5 != $new_md5" + + # Skip free space checks with ZFS + if [ "$(facet_fstype $facet)" != "zfs" ]; then + local mdtfree2=$(do_facet $facet \ + lctl get_param -n osd*.*$mdtname.kbytesfree) + [ $mdtfree2 -gt $mdtfree1 ] || + error "MDS 
space is not freed after DOM mirror deletion" + fi + return 0 +} +run_test 272d "DoM mirroring: OST-striped mirror to DOM file" + +test_272e() { + [ $MDS1_VERSION -lt $(version_code 2.12.55) ] && + skip "Need MDS version at least 2.12.55" + + local dom=$DIR/$tdir/$tfile + mkdir -p $DIR/$tdir + $LFS setstripe -c 2 $dom + + dd if=/dev/urandom of=$dom bs=512K count=1 oflag=direct || + error "failed to write data into $dom" + local old_md5=$(md5sum $dom) + cancel_lru_locks mdc + + $LFS mirror extend -N -E 1M -L mdt -E eof -c2 $dom || + error "failed mirroring to the DOM layout" + $LFS mirror resync $dom || + error "failed mirror resync" + $LFS mirror split --mirror-id 1 -d $dom || + error "failed mirror split" + + [ $($LFS getstripe -L $dom) != 'mdt' ] || + error "MDT stripe was not removed" + + cancel_lru_locks mdc + local new_md5=$(md5sum $dom) + [ "$old_md5" == "$new_md5" ] || + error "$old_md5 != $new_md5" + + return 0 +} +run_test 272e "DoM mirroring: DOM mirror to the OST-striped file" + +test_272f() { + [ $MDS1_VERSION -lt $(version_code 2.12.55) ] && + skip "Need MDS version at least 2.12.55" + + local dom=$DIR/$tdir/$tfile + mkdir -p $DIR/$tdir + $LFS setstripe -c 2 $dom + + dd if=/dev/urandom of=$dom bs=512K count=1 oflag=direct || + error "failed to write data into $dom" + local old_md5=$(md5sum $dom) + cancel_lru_locks mdc + + $LFS migrate -E 1M -L mdt -E eof -c2 -v $dom || + error "failed migrating to the DOM file" + + cancel_lru_locks mdc + local new_md5=$(md5sum $dom) + [ "$old_md5" != "$new_md5" ] && + error "$old_md5 != $new_md5" + + return 0 +} +run_test 272f "DoM migration: OST-striped file to DOM file" + test_273a() { [ $MDS1_VERSION -lt $(version_code 2.11.50) ] && skip "Need MDS version at least 2.11.50" @@ -18377,6 +18681,33 @@ test_277() { } run_test 277 "Direct IO shall drop page cache" +test_278() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" && return + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + [[ "$(facet_host mds1)" 
!= "$(facet_host mds2)" ]] && + skip "needs the same host for mdt1 mdt2" && return + + local pid1 + local pid2 + +#define OBD_FAIL_OBD_STOP_MDS_RACE 0x60b + do_facet mds2 $LCTL set_param fail_loc=0x8000060c + stop mds2 & + pid2=$! + + stop mds1 + + echo "Starting MDTs" + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS + wait $pid2 +#For the error assertion will happen. lu_env_get_key(..., &mdt_thread_key) +#will return NULL + do_facet mds2 $LCTL set_param fail_loc=0 + + start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS +} +run_test 278 "Race starting MDS between MDTs stop/start" + cleanup_test_300() { trap 0 umask $SAVE_UMASK @@ -20735,7 +21066,8 @@ test_801a() { echo "Start barrier_freeze at: $(date)" #define OBD_FAIL_BARRIER_DELAY 0x2202 do_facet mgs $LCTL set_param fail_val=5 fail_loc=0x2202 - do_facet mgs $LCTL barrier_freeze $FSNAME 10 & + # Do not reduce barrier time - See LU-11873 + do_facet mgs $LCTL barrier_freeze $FSNAME 20 & sleep 2 local b_status=$(barrier_stat) @@ -20757,7 +21089,8 @@ test_801a() { [ "$b_status" = "'expired'" ] || error "(3) unexpected barrier status $b_status" - do_facet mgs $LCTL barrier_freeze $FSNAME 10 || + # Do not reduce barrier time - See LU-11873 + do_facet mgs $LCTL barrier_freeze $FSNAME 20 || error "(4) fail to freeze barrier" b_status=$(barrier_stat) @@ -20882,7 +21215,8 @@ test_801c() { do_facet mgs $LCTL barrier_rescan $FSNAME || error "(3) Fail to rescan barrier bitmap" - do_facet mgs $LCTL barrier_freeze $FSNAME 10 + # Do not reduce barrier time - See LU-11873 + do_facet mgs $LCTL barrier_freeze $FSNAME 20 b_status=$(barrier_stat) [ "$b_status" = "'frozen'" ] || @@ -21657,6 +21991,9 @@ test_815() run_test 815 "zero byte tiny write doesn't hang (LU-12382)" test_816() { + [ "$SHARED_KEY" = true ] && + skip "OSC connections never go IDLE with Shared-Keys enabled" + $LFS setstripe -c 1 -i 0 $DIR/$tfile # ensure ost1 is connected stat $DIR/$tfile >/dev/null || error "can't stat" @@ -21701,6 +22038,19 @@ test_817() { } run_test 817 
"nfsd won't cache write lock for exec file" +test_818() { + mkdir $DIR/$tdir + $LFS setstripe -c1 -i0 $DIR/$tfile + $LFS setstripe -c1 -i1 $DIR/$tfile + stop $SINGLEMDS + #define OBD_FAIL_OSP_CANT_PROCESS_LLOG 0x2105 + do_facet $SINGLEMDS lctl set_param fail_loc=0x80002105 + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS || + error "start $SINGLEMDS failed" + rm -rf $DIR/$tdir +} +run_test 818 "unlink with failed llog" + # # tests that do cleanup/setup should be run at the end #