From c2b82ef8a12e6c55f143fbd8986c094425ed667e Mon Sep 17 00:00:00 2001
From: wang di
Date: Fri, 31 Jul 2015 04:55:19 -0700
Subject: [PATCH 1/1] LU-3534 tests: a few test cases for async update.

1. Add update migrate test case in conf-sanity.sh 32c.
2. Add replay-dual.sh 26 to failover during tar and dbench.
3. Add replay-single.sh 70d and 70e to run random_fail_mdt during
   striped directory mkdir/rmdir and during rename.

Signed-off-by: wang di
Change-Id: I1431bfe8d076a16802d9bba7ca3a7b9d47745f5c
Reviewed-on: http://review.whamcloud.com/15163
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Jian Yu
Reviewed-by: Lai Siyao
Reviewed-by: Oleg Drokin
---
 lustre/tests/conf-sanity.sh   |  40 +++++++++++
 lustre/tests/replay-dual.sh   |  95 ++++++++++++++++++++++++++
 lustre/tests/replay-single.sh | 152 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 287 insertions(+)

diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index b0a19ef..23cd0da 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -1646,6 +1646,7 @@ t32_test() {
 	local fstype=$(facet_fstype $SINGLEMDS)
 	local mdt_dev=$tmp/mdt
 	local ost_dev=$tmp/ost
+	local dir mdt_index stripe_cnt
 
 	trap 'trap - RETURN; t32_test_cleanup' RETURN
 
@@ -1907,6 +1908,9 @@ t32_test() {
 
 			$LFS setdirstripe -D -c2 $tmp/mnt/lustre/remote_dir
 
+			$r $LCTL set_param -n \
+				mdt.${fsname}*.enable_remote_dir=1 2>/dev/null
+
 			pushd $tmp/mnt/lustre
 			tar -cf - . --exclude=./remote_dir |
 				tar -xvf - -C remote_dir 1>/dev/null || {
@@ -1992,6 +1996,41 @@ t32_test() {
 			echo "list verification skipped"
 		fi
 
+		if [ $(lustre_version_code mds1) -ge $(version_code 2.7.50) -a \
+		     "$dne_upgrade" != "no" ]; then
+			$r $LCTL set_param -n \
+				mdt.${fsname}*.enable_remote_dir=1 2>/dev/null
+
+			echo "test migration"
+			pushd $tmp/mnt/lustre
+			# migrate the files/directories to the remote MDT, then
+			# move it back
+			for dir in $(find ! -name .lustre ! -name . -type d); do
+				mdt_index=$($LFS getdirstripe -i $dir)
+				stripe_cnt=$($LFS getdirstripe -c $dir)
+				if [ $mdt_index = 0 -a $stripe_cnt -le 1 ]; then
+					$LFS mv -M 1 $dir || {
+						popd
+						error_noexit "migrate MDT1 failed"
+						return 1
+					}
+				fi
+			done
+
+			for dir in $(find ! -name . ! -name .lustre -type d); do
+				mdt_index=$($LFS getdirstripe -i $dir)
+				stripe_cnt=$($LFS getdirstripe -c $dir)
+				if [ $mdt_index = 1 -a $stripe_cnt -le 1 ]; then
+					$LFS mv -M 0 $dir || {
+						popd
+						error_noexit "migrate MDT0 failed"
+						return 1
+					}
+				fi
+			done
+			popd
+		fi
+
 		#
 		# When adding new data verification tests, please check for
 		# the presence of the required reference files first, like
@@ -2102,6 +2141,7 @@ test_32c() {
 		# Do not support 1_8 and 2_1 direct upgrade to DNE2 anymore */
 		echo $tarball | grep "1_8" && continue
 		echo $tarball | grep "2_1" && continue
+		load_modules
 		dne_upgrade=yes t32_test $tarball writeconf || rc=$?
 	done
 	return $rc
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index fa775ab..e3195ea 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -920,6 +920,101 @@ test_25() {
 }
 run_test 25 "replay|resend"
 
+# Stop the background loads started by test_26: kill the tar and dbench
+# loop subshells and any remaining dbench processes, and reset the EXIT
+# trap that test_26 installs.
+cleanup_26() {
+	trap 0
+	kill -9 $tar_26_pid
+	kill -9 $dbench_26_pid
+	killall -9 dbench
+}
+
+# Run a tar loop under $DIR and a dbench loop under $DIR2 in the
+# background while failing over mds1..mds$MDSCOUNT in turn until
+# $duration seconds have elapsed; fail if either loop has stopped.
+test_26() {
+	local clients=${CLIENTS:-$HOSTNAME}
+
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=600
+	[ "$SLOW" = "no" ] && duration=200
+	# set duration to 900 because it takes some time to boot node
+	[ "$FAILURE_MODE" = HARD ] && duration=900
+
+	local elapsed
+	local start_ts=$(date +%s)
+	local rc=0
+
+	trap cleanup_26 EXIT
+	(
+		local tar_dir=$DIR/$tdir/run_tar
+		while true; do
+			test_mkdir -p -c$MDSCOUNT $tar_dir || break
+			# not "A && B || C": that would call error whenever
+			# $MDSCOUNT is less than 2
+			if [ $MDSCOUNT -ge 2 ]; then
+				$LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
+					error "set default dirstripe failed"
+			fi
+			cd $tar_dir || break
+			tar cf - /etc | tar xf - || error "tar failed"
+			cd $DIR/$tdir || break
+			rm -rf $tar_dir || break
+		done
+	)&
+	tar_26_pid=$!
+	echo "Started tar $tar_26_pid"
+
+	(
+		local dbench_dir=$DIR2/$tdir/run_dbench
+		while true; do
+			test_mkdir -p -c$MDSCOUNT $dbench_dir || break
+			if [ $MDSCOUNT -ge 2 ]; then
+				$LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
+					error "set default dirstripe failed"
+			fi
+			cd $dbench_dir || break
+			rundbench 1 -D $dbench_dir -t 100 > /dev/null 2>&1 ||
+				break
+			cd $DIR/$tdir || break
+			rm -rf $dbench_dir || break
+		done
+	)&
+	dbench_26_pid=$!
+	echo "Started dbench $dbench_26_pid"
+
+	elapsed=$(($(date +%s) - start_ts))
+	local num_failovers=0
+	local fail_index=1
+	while [ $elapsed -lt $duration ]; do
+		# kill -0 tests the exact PID; grepping "ps" output could
+		# also match the number inside unrelated fields
+		kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
+		kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
+		sleep 2
+		replay_barrier mds$fail_index
+		sleep 2 # give clients a time to do operations
+		# Increment the number of failovers
+		num_failovers=$((num_failovers+1))
+		log "$TESTNAME fail mds$fail_index $num_failovers times"
+		fail mds$fail_index
+		elapsed=$(($(date +%s) - start_ts))
+		if [ $fail_index -ge $MDSCOUNT ]; then
+			fail_index=1
+		else
+			fail_index=$((fail_index+1))
+		fi
+	done
+	# stop the client loads
+	kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
+	kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
+	cleanup_26
+	true
+}
+run_test 26 "dbench and tar with mds failover"
+
 complete $SECONDS
 SLEEP=$((`date +%s` - $NOW))
 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index d5e7404..e4de09b 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -2167,6 +2167,158 @@ test_70c () {
 }
 run_test 70c "tar ${MDSCOUNT}mdts recovery"
 
+# Kill the background mkdir/rmdir loop started by test_70d and reset the
+# EXIT trap that test_70d installs.
+cleanup_70d() {
+	trap 0
+	kill -9 $mkdir_70d_pid
+}
+
+# Create, populate and remove the striped directories "test" (-i0 -c2) and
+# "test1" (-i1 -c2) in a background loop while random_fail_mdt is running,
+# then fail if that loop is no longer alive.
+test_70d () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local clients=${CLIENTS:-$HOSTNAME}
+	local rc=0
+
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=300
+	[ "$SLOW" = "no" ] && duration=180
+	# set duration to 900 because it takes some time to boot node
+	[ "$FAILURE_MODE" = HARD ] && duration=900
+
+	mkdir -p $DIR/$tdir
+
+	local elapsed
+	local start_ts=$(date +%s)
+
+	trap cleanup_70d EXIT
+	(
+		while true; do
+			$LFS mkdir -i0 -c2 $DIR/$tdir/test || {
+				echo "mkdir fails"
+				break
+			}
+			$LFS mkdir -i1 -c2 $DIR/$tdir/test1 || {
+				echo "mkdir fails"
+				break
+			}
+
+			touch $DIR/$tdir/test/a || {
+				echo "touch fails"
+				break
+			}
+			mkdir $DIR/$tdir/test/b || {
+				echo "mkdir fails"
+				break
+			}
+			rm -rf $DIR/$tdir/test || {
+				echo "rmdir fails"
+				break
+			}
+
+			touch $DIR/$tdir/test1/a || {
+				echo "touch fails"
+				break
+			}
+			mkdir $DIR/$tdir/test1/b || {
+				echo "mkdir fails"
+				break
+			}
+
+			rm -rf $DIR/$tdir/test1 || {
+				echo "rmdir fails"
+				break
+			}
+		done
+	)&
+	mkdir_70d_pid=$!
+	echo "Started $mkdir_70d_pid"
+
+	random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid
+	kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped"
+
+	cleanup_70d
+	true
+}
+run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery"
+
+# Kill the background rename loop started by test_70e and reset the EXIT
+# trap that test_70e installs.
+cleanup_70e() {
+	trap 0
+	kill -9 $rename_70e_pid
+}
+
+# Rename a file back and forth between test_0/a and test_1/b in a
+# background loop while random_fail_mdt is running, then fail if that
+# loop is no longer alive.
+test_70e () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local clients=${CLIENTS:-$HOSTNAME}
+	local rc=0
+
+	# add "ha" to the debug mask rather than replacing the whole mask
+	$LCTL set_param debug=+ha
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=300
+	[ "$SLOW" = "no" ] && duration=180
+	# set duration to 900 because it takes some time to boot node
+	[ "$FAILURE_MODE" = HARD ] && duration=900
+
+	mkdir -p $DIR/$tdir
+	# NOTE(review): both directories are created with -i0 although the
+	# test is described as cross-MDT -- confirm this is intended
+	$LFS mkdir -i0 $DIR/$tdir/test_0
+	$LFS mkdir -i0 $DIR/$tdir/test_1
+	touch $DIR/$tdir/test_0/a
+	touch $DIR/$tdir/test_1/b
+	trap cleanup_70e EXIT
+	(
+		while true; do
+			mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > \
+						/dev/null || {
+				echo "a->b fails"
+				break
+			}
+
+			checkstat $DIR/$tdir/test_0/a && {
+				echo "a still exists"
+				break
+			}
+
+			checkstat $DIR/$tdir/test_1/b || {
+				echo "b does not exist"
+				break
+			}
+
+			touch $DIR/$tdir/test_0/a || {
+				echo "touch a fails"
+				break
+			}
+
+			mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > \
+						/dev/null || {
+				echo "b->a fails"
+				break
+			}
+		done
+	)&
+	rename_70e_pid=$!
+	echo "Started $rename_70e_pid"
+
+	random_fail_mdt 2 $duration $rename_70e_pid
+	kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped"
+
+	cleanup_70e
+	true
+}
+run_test 70e "rename cross-MDT with random fails"
+
+
 test_73a() {
 	multiop_bg_pause $DIR/$tfile O_tSc ||
 		error "multiop_bg_pause $DIR/$tfile failed"
-- 
1.8.3.1