X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Freplay-single.sh;h=b8b77314d56c88a7b91c6f409d3d4b9fa94e63a9;hp=31586b2a7798b46b850e01536295edb1c48a5cdf;hb=f44fe5abbc74ca79790c100a30193ded1ef1e6c9;hpb=b9e1bb635039c6d2d985754a9a029c9d5c20b569 diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 31586b2..b8b7731 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -26,12 +26,6 @@ require_dsh_mds || exit 0 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT 61d" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -case "$(lsb_release -sr)" in # only disable tests for el7 -7*) # bug number: LU-6455----- - ALWAYS_EXCEPT="$ALWAYS_EXCEPT 28" - ;; -esac - # 7.5 (min)" [ "$SLOW" = "no" ] && EXCEPT_SLOW="44b" @@ -873,7 +867,7 @@ test_40(){ lctl get_param mdc.*.connect_flags | grep -q layout_lock && skip "layout_lock needs MDS connection for IO" && return 0 - $LCTL mark "$HOSTNAME multiop $MOUNT/$tfile OS_c" + $LCTL mark multiop $MOUNT/$tfile OS_c multiop $MOUNT/$tfile OS_c & PID=$! writeme -s $MOUNT/${tfile}-2 & @@ -2036,10 +2030,6 @@ check_for_process () { test_70b () { local clients=${CLIENTS:-$HOSTNAME} - local mdscount=$MDSCOUNT - - # until LU-6844 is fixed, run on one MDT instead of disabling test - mdscount=1 zconf_mount_clients $clients $MOUNT @@ -2052,9 +2042,9 @@ test_70b () { local start_ts=$(date +%s) local cmd="rundbench 1 -t $duration" local pid="" - if [ $mdscount -ge 2 ]; then - test_mkdir -p -c$mdscount $DIR/$tdir - $LFS setdirstripe -D -c$mdscount $DIR/$tdir + if [ $MDSCOUNT -ge 2 ]; then + test_mkdir -p -c$MDSCOUNT $DIR/$tdir + $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir fi do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \ PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \ @@ -2091,7 +2081,7 @@ test_70b () { log "$TESTNAME fail mds$fail_index $num_failovers times" fail mds$fail_index elapsed=$(($(date +%s) - start_ts)) - if [ $fail_index -ge $mdscount ]; then + if [ $fail_index -ge $MDSCOUNT ]; then fail_index=1 else fail_index=$((fail_index+1)) @@ -2275,7 +2265,7 @@ test_70e () { while true; do mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > \ /dev/null || { - echo "a->b fails" + echo "a->b fails" break; } @@ -2312,6 +2302,118 @@ test_70e () { } run_test 70e "rename cross-MDT with random fails" +test_70f_write_and_read(){ + local srcfile=$1 + local stopflag=$2 + local client + + echo "Write/read files in: '$DIR/$tdir', clients: '$CLIENTS' ..." + for client in ${CLIENTS//,/ }; do + [ -f $stopflag ] || return + + local tgtfile=$DIR/$tdir/$tfile.$client + do_node $client dd $DD_OPTS bs=1M count=10 if=$srcfile \ + of=$tgtfile 2>/dev/null || + error "dd $DD_OPTS bs=1M count=10 if=$srcfile " \ + "of=$tgtfile failed on $client, rc=$?" + done + + local prev_client=$(echo ${CLIENTS//,/ } | awk '{ print $NF }') + local index=0 + + for client in ${CLIENTS//,/ }; do + [ -f $stopflag ] || return + + # flush client cache in case test is running on only one client + # do_node $client cancel_lru_locks osc + do_node $client $LCTL set_param ldlm.namespaces.*.lru_size=clear + + tgtfile=$DIR/$tdir/$tfile.$client + local md5=$(do_node $prev_client "md5sum $tgtfile") + [ ${checksum[$index]// */} = ${md5// */} ] || + error "$tgtfile: checksum doesn't match on $prev_client" + index=$((index + 1)) + prev_client=$client + done +} + +test_70f_loop(){ + local srcfile=$1 + local stopflag=$2 + DD_OPTS= + + mkdir -p $DIR/$tdir || error "cannot create $DIR/$tdir directory" + $SETSTRIPE -c -1 $DIR/$tdir || error "cannot $SETSTRIPE $DIR/$tdir" + + touch $stopflag + while [ -f $stopflag ]; do + test_70f_write_and_read $srcfile $stopflag + # use direct IO and buffer cache in turns if loop + [ -n "$DD_OPTS" ] && DD_OPTS="" || DD_OPTS="oflag=direct" + done +} + +test_70f_cleanup() { + trap 0 + rm -f $TMP/$tfile.stop + do_nodes $CLIENTS rm -f $TMP/$tfile + rm -f $DIR/$tdir/$tfile.* +} + +test_70f() { +# [ x$ost1failover_HOST = x$ost_HOST ] && +# { skip "Failover host not defined" && return; } +# [ -z "$CLIENTS" ] && +# { skip "CLIENTS are not specified." && return; } +# [ $CLIENTCOUNT -lt 2 ] && +# { skip "Need 2 or more clients, have $CLIENTCOUNT" && return; } + + echo "mount clients $CLIENTS ..." + zconf_mount_clients $CLIENTS $MOUNT + + local srcfile=$TMP/$tfile + local client + local index=0 + + trap test_70f_cleanup EXIT + # create a different source file local to each client node so we can + # detect if the file wasn't written out properly after failover + do_nodes $CLIENTS dd bs=1M count=10 if=/dev/urandom of=$srcfile \ + 2>/dev/null || error "can't create $srcfile on $CLIENTS" + for client in ${CLIENTS//,/ }; do + checksum[$index]=$(do_node $client "md5sum $srcfile") + index=$((index + 1)) + done + + local duration=120 + [ "$SLOW" = "no" ] && duration=60 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + local stopflag=$TMP/$tfile.stop + test_70f_loop $srcfile $stopflag & + local pid=$! + + local elapsed=0 + local num_failovers=0 + local start_ts=$SECONDS + while [ $elapsed -lt $duration ]; do + sleep 3 + replay_barrier ost1 + sleep 1 + num_failovers=$((num_failovers + 1)) + log "$TESTNAME failing OST $num_failovers times" + fail ost1 + sleep 2 + elapsed=$((SECONDS - start_ts)) + done + + rm -f $stopflag + wait $pid + test_70f_cleanup +} +run_test 70f "OSS O_DIRECT recovery with $CLIENTCOUNT clients" + cleanup_71a() { trap 0 kill -9 $mkdir_71a_pid @@ -2952,7 +3054,7 @@ test_85a() { #bug 16774 done lov_id=$(lctl dl | grep "clilov") - addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}') + addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}') count=$(lctl get_param -n \ ldlm.namespaces.*MDT0000*$addr.lock_unused_count) echo "before recovery: unused locks count = $count" @@ -2972,7 +3074,7 @@ run_test 85a "check the cancellation of unused locks during recovery(IBITS)" test_85b() { #bug 16774 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1" - do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME || + create_pool $FSNAME.$TESTNAME || error "unable to create pool $TESTNAME" do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $FSNAME-OST0000 || error "unable to add pool $TESTNAME" @@ -2992,11 +3094,11 @@ test_85b() { #bug 16774 done lov_id=$(lctl dl | grep "clilov") - addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}') - count=$(lctl get_param \ - -n ldlm.namespaces.*OST0000*$addr.lock_unused_count) + addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}') + count=$(lctl get_param -n \ + ldlm.namespaces.*OST0000*$addr.lock_unused_count) echo "before recovery: unused locks count = $count" - [ $count != 0 ] || error "unused locks ($count) should be zero" + [ $count -ne 0 ] || error "unused locks ($count) should be zero" fail ost1 @@ -3025,7 +3127,7 @@ test_86() { } run_test 86 "umount server after clear nid_stats should not hit LBUG" -test_87() { +test_87a() { do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0" replay_barrier ost1 @@ -3041,7 +3143,7 @@ test_87() { error "New checksum $cksum2 does not match original $cksum" fi } -run_test 87 "write replay" +run_test 87a "write replay" test_87b() { do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0" @@ -3192,6 +3294,8 @@ test_90() { # bug 19494 return 0 fi fi + # ensure all OSTs are active to allow allocations + wait_osts_up mkdir $dir || error "mkdir $dir failed" @@ -3263,7 +3367,7 @@ test_90() { # bug 19494 } run_test 90 "lfs find identifies the missing striped file segments" -test_93() { +test_93a() { local server_version=$(lustre_version_code $SINGLEMDS) [[ $server_version -ge $(version_code 2.6.90) ]] || [[ $server_version -ge $(version_code 2.5.4) && @@ -3285,7 +3389,28 @@ test_93() { do_facet ost1 "$LCTL set_param fail_loc=0x715" fail ost1 } -run_test 93 "replay + reconnect" +run_test 93a "replay + reconnect" + +test_93b() { + local server_version=$(lustre_version_code $SINGLEMDS) + [[ $server_version -ge $(version_code 2.7.90) ]] || + { skip "Need MDS version 2.7.90+"; return; } + + cancel_lru_locks mdc + + createmany -o $DIR/$tfile 20 || + error "createmany -o $DIR/$tfile failed" + + #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 + # We need to emulate a state that MDT is waiting for other clients + # not completing the recovery. Final ping is queued, but reply will be + # sent on the recovery completion. It is done by sleep before + # processing final pings + do_facet mds1 "$LCTL set_param fail_val=80" + do_facet mds1 "$LCTL set_param fail_loc=0x715" + fail mds1 +} +run_test 93b "replay + reconnect on mds" striped_dir_check_100() { local striped_dir=$DIR/$tdir/striped_dir