X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;ds=sidebyside;f=lustre%2Ftests%2Fsanity.sh;h=8ac81384a436e711f515bcf816c50a5918d76d6e;hb=e5346a494fcb54b7f9fbc7ed4fb93003a8489231;hp=ab86f14c727b6bc109dc3b99d0c0e8a91771a53a;hpb=02c23a2e851fdebc3e2bde45a51fb043559504ab;p=fs%2Flustre-release.git diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh old mode 100644 new mode 100755 index ab86f14..8ac81384 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -64,13 +64,19 @@ if [[ $(uname -m) = aarch64 ]]; then fi # skip nfs tests on kernels >= 4.14.0 until they are fixed -if [ $LINUX_VERSION_CODE -ge $(version_code 4.14.0) ];then +if [ $LINUX_VERSION_CODE -ge $(version_code 4.14.0) ]; then # bug number: LU-12661 ALWAYS_EXCEPT+=" 817" fi +# skip cgroup tests on RHEL8.1 kernels until they are fixed +if (( $LINUX_VERSION_CODE >= $(version_code 4.18.0) && + $LINUX_VERSION_CODE < $(version_code 5.4.0) )); then + # bug number: LU-13063 + ALWAYS_EXCEPT+=" 411" +fi # 5 12 (min)" -[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 300o" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="27m 64b 68 71 115 135 136 300o" if [ "$mds1_FSTYPE" = "zfs" ]; then # bug number for skipped test: @@ -550,11 +556,6 @@ test_17g() { [ $MDS1_VERSION -le $(version_code 2.3.55) ] && TESTS="4094 4095" - # skip long symlink name for rhel6.5. - # rhel6.5 has a limit (PATH_MAX - sizeof(struct filename)) - grep -q '6.5' /etc/redhat-release &>/dev/null && - TESTS="59 60 61 4062 4063" - for i in $TESTS; do local SYMNAME=$(str_repeat 'x' $i) ln -s $SYMNAME $DIR/$tdir/f$i || error "failed $i-char symlink" @@ -2641,7 +2642,7 @@ test_27I() { run_test 27I "check that root dir striping does not break parent dir one" test_27J() { - [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.51) ]] && + [[ $MDS1_VERSION -le $(version_code 2.12.51) ]] && skip "Need MDS version newer than 2.12.51" test_mkdir $DIR/$tdir @@ -2736,7 +2737,7 @@ test_27J() { run_test 27J "basic ops on file with foreign LOV" test_27K() { - [[ $(lustre_version_code $SINGLEMDS) -le $(version_code 2.12.49) ]] && + [[ $MDS1_VERSION -le $(version_code 2.12.49) ]] && skip "Need MDS version newer than 2.12.49" test_mkdir $DIR/$tdir @@ -7838,7 +7839,7 @@ run_test 65m "normal user can't set filesystem default stripe" test_65n() { [ -n "$FILESET" ] && skip "Not functional for FILESET set" - [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.12.50) ]] || + [[ $MDS1_VERSION -ge $(version_code 2.12.50) ]] || skip "Need MDS version at least 2.12.50" [[ $PARALLEL != "yes" ]] || skip "skip parallel run" @@ -11162,6 +11163,27 @@ test_123b () { # statahead(bug 15027) } run_test 123b "not panic with network error in statahead enqueue (bug 15027)" +test_123c() { + [[ $MDSCOUNT -lt 2 ]] && skip_env "needs >= 2 MDTs" + + test_mkdir -i 0 -c 1 $DIR/$tdir.0 + test_mkdir -i 1 -c 1 $DIR/$tdir.1 + touch $DIR/$tdir.1/{1..3} + mv $DIR/$tdir.1/{1..3} $DIR/$tdir.0 + + remount_client $MOUNT + + $MULTIOP $DIR/$tdir.0 Q + + # let statahead to complete + ls -l $DIR/$tdir.0 > /dev/null + + testid=$(echo $TESTNAME | tr '_' ' ') + dmesg | tac | sed "/$testid/,$ d" | grep "Can not initialize inode" && + error "statahead warning" || true +} +run_test 123c "Can not initialize inode warning on DNE statahead" + test_124a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" $LCTL get_param -n mdc.*.connect_flags | grep -q lru_resize || @@ -12472,33 +12494,22 @@ test_133g() { remote_mds_nodsh && skip "remote MDS with nodsh" remote_ost_nodsh && skip "remote OST with nodsh" - # eventually, this can also be replaced with "lctl get_param -R", - # but not until that option is always available on the server local facet for facet in mds1 ost1; do - [ $(lustre_version_code $facet) -le $(version_code 2.5.54) ] && - skip_noexit "Too old lustre on $facet" - local facet_proc_dirs=$(do_facet $facet \ - \\\ls -d $proc_regexp 2>/dev/null) - echo "${facet}_proc_dirs='$facet_proc_dirs'" - [ -z "$facet_proc_dirs" ] && error "no proc_dirs on $facet" - do_facet $facet find $facet_proc_dirs \ - ! -name req_history \ - -exec cat '{}' \\\; &> /dev/null - - do_facet $facet find $facet_proc_dirs \ - ! -name req_history \ - -type f \ - -exec cat '{}' \\\; &> /dev/null || - error "proc file read failed" - - do_facet $facet find $facet_proc_dirs \ - -ignore_readdir_race \ - -type f \ - -not -name force_lbug \ - -not -name changelog_mask \ - -exec badarea_io '{}' \\\; || - error_133 "$facet find $facet_proc_dirs failed" + local facet_ver=$(lustre_version_code $facet) + if [ $facet_ver -ge $(version_code 2.7.65) ]; then + do_facet $facet "$LCTL get_param -R '*'" &> /dev/null + else + log "$facet: too old lustre for get_param -R" + fi + if [ $facet_ver -ge $(version_code 2.5.54) ]; then + do_facet $facet "$LCTL list_param -R '*' | grep '=' | + tr -d= | egrep -v 'force_lbug|changelog_mask' | + xargs badarea_io" || + error_133 "$facet badarea_io failed" + else + skip_noexit "$facet: too old lustre for get_param -R" + fi done # remount the FS in case writes/reads /proc break the FS @@ -12614,6 +12625,73 @@ test_134b() { } run_test 134b "Server rejects lock request when reaching lock_limit_mb" +test_135() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ $MDS1_VERSION -lt $(version_code 2.13.50) ]] && + skip "Need MDS version at least 2.13.50" + local fname + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + +#define OBD_FAIL_PLAIN_RECORDS 0x1319 + #set only one record at plain llog + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x1319 fail_val=1 + + #fill already existed plain llog each 64767 + #wrapping whole catalog + createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1)) + + createmany -o $DIR/$tdir/$tfile_ 64700 + for (( i = 0; i < 64700; i = i + 2 )) + do + rm $DIR/$tdir/$tfile_$i & + rm $DIR/$tdir/$tfile_$((i + 1)) & + local pid=$! + wait $pid + done + + #waiting osp synchronization + wait_delete_completed +} +run_test 135 "Race catalog processing" + +test_136() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [[ $MDS1_VERSION -lt $(version_code 2.13.50) ]] && + skip "Need MDS version at least 2.13.50" + local fname + + mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" + $SETSTRIPE -c 1 -i 0 $DIR/$tdir || error "failed to set striping" + #set only one record at plain llog +#define OBD_FAIL_CATALOG_FULL_CHECK 0x131a + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x131a fail_val=1 + + #fill already existed 2 plain llogs each 64767 + #wrapping whole catalog + createmany -o -u $DIR/$tdir/$tfile- $((64767 * 1)) + createmany -o -u $DIR/$tdir/$tfile- $((64767 * 3 / 2)) + wait_delete_completed + + createmany -o $DIR/$tdir/$tfile_ 10 + sleep 25 + + do_facet $SINGLEMDS $LCTL set_param fail_val=3 + for (( i = 0; i < 10; i = i + 3 )) + do + rm $DIR/$tdir/$tfile_$i & + rm $DIR/$tdir/$tfile_$((i + 1)) & + local pid=$! + wait $pid + sleep 7 + rm $DIR/$tdir/$tfile_$((i + 2)) & + done + + #waiting osp synchronization + wait_delete_completed +} +run_test 136 "Race catalog processing 2" + test_140() { #bug-17379 [ $PARALLEL == "yes" ] && skip "skip parallel run" @@ -13798,14 +13876,17 @@ test_160f() { # generate some changelog records to accumulate on each MDT test_mkdir -c $MDSCOUNT $DIR/$tdir || error "test_mkdir $tdir failed" + log "$(date +%s): creating first files" createmany -m $DIR/$tdir/$tfile $((MDSCOUNT * 2)) || error "create $DIR/$tdir/$tfile failed" # check changelogs have been generated + local start=$SECONDS + local idle_time=$((MDSCOUNT * 5 + 5)) local nbcl=$(changelog_dump | wc -l) [[ $nbcl -eq 0 ]] && error "no changelogs found" - for param in "changelog_max_idle_time=10" \ + for param in "changelog_max_idle_time=$idle_time" \ "changelog_gc=1" \ "changelog_min_gc_interval=2" \ "changelog_min_free_cat_entries=3"; do @@ -13817,8 +13898,11 @@ test_160f() { do_nodes $mdts $LCTL set_param mdd.*.$param done - # force cl_user2 to be idle (1st part) - sleep 9 + # force cl_user2 to be idle (1st part), but also cancel the + # cl_user1 records so that it is not evicted later in the test. + local sleep1=$((idle_time / 2)) + echo "$(date +%s): sleep1 $sleep1/${idle_time}s" + sleep $sleep1 # simulate changelog catalog almost full #define OBD_FAIL_CAT_FREE_RECORDS 0x1313 @@ -13854,13 +13938,16 @@ test_160f() { "$user_rec1, but is $user_rec2" done - # force cl_user2 to be idle (2nd part) and to reach - # changelog_max_idle_time - sleep 2 + # force cl_user2 idle (2nd part) to just exceed changelog_max_idle_time + local sleep2=$((idle_time - (SECONDS - start) + 1)) + echo "$(date +%s): sleep2 $sleep2/${idle_time}s" + sleep $sleep2 - # generate one more changelog to trigger fail_loc - createmany -m $DIR/$tdir/${tfile}bis $((MDSCOUNT * 2)) || - error "create $DIR/$tdir/${tfile}bis failed" + # Generate one more changelog to trigger GC at fail_loc for cl_user2. + # cl_user1 should be OK because it recently processed records. + echo "$(date +%s): creating $((MDSCOUNT * 2)) files" + createmany -m $DIR/$tdir/${tfile}b $((MDSCOUNT * 2)) || + error "create $DIR/$tdir/${tfile}b failed" # ensure gc thread is done for i in $(mdts_nodes); do @@ -15463,7 +15550,7 @@ jobstats_set() { "$FSNAME.sys.jobid_var" $new_jobenv } -test_205() { # Job stats +test_205a() { # Job stats [ $PARALLEL == "yes" ] && skip "skip parallel run" [[ $MDS1_VERSION -ge $(version_code 2.7.1) ]] || skip "Need MDS version with at least 2.7.1" @@ -15565,7 +15652,18 @@ test_205() { # Job stats verify_jobstats "touch $DIR/$tfile" $SINGLEMDS } -run_test 205 "Verify job stats" +run_test 205a "Verify job stats" + +# LU-13117 +test_205b() { + $LCTL set_param jobid_var=USER jobid_name="%e.%u" + env -i USERTESTJOBSTATS=foolish touch $DIR/$tfile.1 + do_facet $SINGLEMDS $LCTL get_param mdt.*.job_stats | + grep job_id: | grep foolish && + error "Unexpected jobid found" + true +} +run_test 205b "Verify job stats jobid parsing" # LU-1480, LU-1773 and LU-1657 test_206() { @@ -19988,7 +20086,7 @@ test_300q() { run_test 300q "create remote directory under orphan directory" test_300r() { - [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] && + [ $MDS1_VERSION -lt $(version_code 2.7.55) ] && skip "Need MDS version at least 2.7.55" && return [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return @@ -20446,6 +20544,105 @@ test_319() { } run_test 319 "lost lease lock on migrate error" +test_398a() { # LU-4198 + $LFS setstripe -c 1 -i 0 $DIR/$tfile + $LCTL set_param ldlm.namespaces.*.lru_size=clear + + # request a new lock on client + dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 + + dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=direct conv=notrunc + local lock_count=$($LCTL get_param -n \ + ldlm.namespaces.*-OST0000-osc-ffff*.lru_size) + [[ $lock_count -eq 0 ]] || error "lock should be cancelled by direct IO" + + $LCTL set_param ldlm.namespaces.*-OST0000-osc-ffff*.lru_size=clear + + # no lock cached, should use lockless IO and not enqueue new lock + dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=direct conv=notrunc + lock_count=$($LCTL get_param -n \ + ldlm.namespaces.*-OST0000-osc-ffff*.lru_size) + [[ $lock_count -eq 0 ]] || error "no lock should be held by direct IO" +} +run_test 398a "direct IO should cancel lock otherwise lockless" + +test_398b() { # LU-4198 + which fio || skip_env "no fio installed" + $LFS setstripe -c -1 $DIR/$tfile + + local size=12 + dd if=/dev/zero of=$DIR/$tfile bs=1M count=$size + + local njobs=4 + echo "mix direct rw ${size}M to OST0 by fio with $njobs jobs..." + fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE --direct=1 \ + --numjobs=$njobs --fallocate=none \ + --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \ + --filename=$DIR/$tfile & + bg_pid=$! + + echo "mix buffer rw ${size}M to OST0 by fio with $njobs jobs..." + fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE \ + --numjobs=$njobs --fallocate=none \ + --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \ + --filename=$DIR/$tfile || true + wait $bg_pid + + rm -rf $DIR/$tfile +} +run_test 398b "DIO and buffer IO race" + +test_398c() { # LU-4198 + which fio || skip_env "no fio installed" + + saved_debug=$($LCTL get_param -n debug) + $LCTL set_param debug=0 + + local size=$(lctl get_param -n osc.$FSNAME-OST0000*.kbytesavail | head -1) + ((size /= 1024)) # by megabytes + ((size /= 2)) # write half of the OST at most + [ $size -gt 40 ] && size=40 #reduce test time anyway + + $LFS setstripe -c 1 $DIR/$tfile + + # it seems like ldiskfs reserves more space than necessary if the + # writing blocks are not mapped, so it extends the file firstly + dd if=/dev/zero of=$DIR/$tfile bs=1M count=$size && sync + cancel_lru_locks osc + + # clear and verify rpc_stats later + $LCTL set_param osc.${FSNAME}-OST0000-osc-ffff*.rpc_stats=clear + + local njobs=4 + echo "writing ${size}M to OST0 by fio with $njobs jobs..." + fio --name=rand-write --rw=randwrite --bs=$PAGE_SIZE --direct=1 \ + --numjobs=$njobs --fallocate=none --ioengine=libaio \ + --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \ + --filename=$DIR/$tfile + [ $? -eq 0 ] || error "fio write error" + + [ $($LCTL get_param -n \ + ldlm.namespaces.${FSNAME}-OST0000-osc-ffff*.lock_count) -eq 0 ] || + error "Locks were requested while doing AIO" + + # get the percentage of 1-page I/O + pct=$($LCTL get_param osc.${FSNAME}-OST0000-osc-ffff*.rpc_stats | + grep -A 1 'pages per rpc' | grep -v 'pages per rpc' | + awk '{print $7}') + [ $pct -le 50 ] || error "$pct% of I/O are 1-page" + + echo "mix rw ${size}M to OST0 by fio with $njobs jobs..." + fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE --direct=1 \ + --numjobs=$njobs --fallocate=none --ioengine=libaio \ + --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \ + --filename=$DIR/$tfile + [ $? -eq 0 ] || error "fio mixed read write error" + + rm -rf $DIR/$tfile + $LCTL set_param debug="$saved_debug" +} +run_test 398c "run fio to test AIO" + test_fake_rw() { local read_write=$1 if [ "$read_write" = "write" ]; then @@ -20959,7 +21156,7 @@ run_test 411 "Slab allocation error with cgroup does not LBUG" test_412() { [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs" - if [ $(lustre_version_code mds1) -lt $(version_code 2.10.55) ]; then + if [ $MDS1_VERSION -lt $(version_code 2.10.55) ]; then skip "Need server version at least 2.10.55" fi @@ -21227,7 +21424,7 @@ run_test 414 "simulate ENOMEM in ptlrpc_register_bulk()" test_415() { [ $PARALLEL == "yes" ] && skip "skip parallel run" - [ $(lustre_version_code mds1) -lt $(version_code 2.11.52) ] && + [ $MDS1_VERSION -lt $(version_code 2.11.52) ] && skip "Need server version at least 2.11.52" # LU-11102 @@ -21269,7 +21466,7 @@ test_415() { run_test 415 "lock revoke is not missing" test_416() { - [ $(lustre_version_code mds1) -lt $(version_code 2.11.55) ] && + [ $MDS1_VERSION -lt $(version_code 2.11.55) ] && skip "Need server version at least 2.11.55" # define OBD_FAIL_OSD_TXN_START 0x19a @@ -21724,7 +21921,7 @@ test_422() { run_test 422 "kill a process with RPC in progress" prep_801() { - [[ $(lustre_version_code mds1) -lt $(version_code 2.9.55) ]] || + [[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] || [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] && skip "Need server version at least 2.9.55" @@ -21955,7 +22152,7 @@ cleanup_802a() { test_802a() { [[ $mds1_FSTYPE = zfs ]] || skip "ZFS specific test" - [[ $(lustre_version_code mds1) -lt $(version_code 2.9.55) ]] || + [[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] || [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] && skip "Need server version at least 2.9.55" @@ -22474,7 +22671,7 @@ test_810() { run_test 810 "partial page writes on ZFS (LU-11663)" test_811() { - [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.11.56) ] && + [ $MDS1_VERSION -lt $(version_code 2.11.56) ] && skip "Need MDS version at least 2.11.56" #define OBD_FAIL_MDS_ORPHAN_DELETE 0x165 @@ -22484,8 +22681,7 @@ test_811() { stop mds1 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS - sleep 5 - [[ $(do_facet mds1 pgrep orph_.*-MDD | wc -l) -eq 0 ]] || + wait_update_facet mds1 "pgrep orph_.*-MDD | wc -l" "0" || error "MDD orphan cleanup thread not quit" } run_test 811 "orphan name stub can be cleaned up in startup"