X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-small.sh;h=45cf76edef70eee97b37e781999c3196f5a88ad9;hp=2f3e76794909fd8f9a9b7b09455b7d278a273d7d;hb=f11196a0d0e53ec5f637d0442d6318b310e21ef0;hpb=8c4f96f910786ff3d73474ef5f8d4a96a30a0bed diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 2f3e767..45cf76e 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3,8 +3,9 @@ set -e # bug 5494 5493 -ALWAYS_EXCEPT="24 52 $RECOVERY_SMALL_EXCEPT" +ALWAYS_EXCEPT="24 52 $RECOVERY_SMALL_EXCEPT" +export MULTIOP=${MULTIOP:-multiop} PTLDEBUG=${PTLDEBUG:--1} LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh @@ -31,61 +32,87 @@ assert_DIR rm -rf $DIR/[df][0-9]* test_1() { - drop_request "mcreate $DIR/f1" || return 1 - drop_reint_reply "mcreate $DIR/f2" || return 2 -} -run_test 1 "mcreate: drop req, drop rep" + local f1="$DIR/$tfile" + local f2="$DIR/$tfile.2" -test_2() { - drop_request "tchmod 111 $DIR/f2" || return 1 - drop_reint_reply "tchmod 666 $DIR/f2" || return 2 -} -run_test 2 "chmod: drop req, drop rep" + drop_request "mcreate $f1" || + error_noexit "create '$f1': drop req" -test_3() { - drop_request "statone $DIR/f2" || return 1 - drop_reply "statone $DIR/f2" || return 2 -} -run_test 3 "stat: drop req, drop rep" + drop_reint_reply "mcreate $f2" || + error_noexit "create '$f2': drop rep" + + drop_request "tchmod 111 $f2" || + error_noexit "chmod '$f2': drop req" + + drop_reint_reply "tchmod 666 $f2" || + error_noexit "chmod '$f2': drop rep" -SAMPLE_NAME=f0.recovery-small.junk -SAMPLE_FILE=$TMP/$SAMPLE_NAME -# make this big, else test 9 doesn't wait for bulk -- bz 5595 -dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4 + drop_request "statone $f2" || + error_noexit "stat '$f2': drop req" + + drop_reply "statone $f2" || + error_noexit "stat '$f2': drop rep" +} +run_test 1 "create, chmod, stat: drop req, drop rep" test_4() { - do_facet client 
"cp $SAMPLE_FILE $DIR/$SAMPLE_NAME" || return 1 - drop_request "cat $DIR/$SAMPLE_NAME > /dev/null" || return 2 - drop_reply "cat $DIR/$SAMPLE_NAME > /dev/null" || return 3 + local t=$DIR/$tfile + do_facet_create_file client $t 10K || + error_noexit "Create file $t" + + drop_request "cat $t > /dev/null" || + error_noexit "Open request for $t file" + + drop_reply "cat $t > /dev/null" || + error_noexit "Open replay for $t file" } run_test 4 "open: drop req, drop rep" -RENAMED_AGAIN=$DIR/f0.renamed-again - test_5() { - drop_request "mv $DIR/$SAMPLE_NAME $DIR/$tfile-renamed" || return 1 - drop_reint_reply "mv $DIR/$tfile-renamed $RENAMED_AGAIN" || return 2 - do_facet client "checkstat -v $RENAMED_AGAIN" || return 3 + local T=$DIR/$tfile + local R="$T-renamed" + local RR="$T-renamed-again" + do_facet_create_file client $T 10K || + error_noexit "Create file $T" + + drop_request "mv $T $R" || + error_noexit "Rename $T" + + drop_reint_reply "mv $R $RR" || + error_noexit "Failed rename replay on $R" + + do_facet client "checkstat -v $RR" || + error_noexit "checkstat error on $RR" + + do_facet client "rm $RR" || + error_noexit "Can't remove file $RR" } run_test 5 "rename: drop req, drop rep" -[ ! -e $RENAMED_AGAIN ] && cp $SAMPLE_FILE $RENAMED_AGAIN -LINK1=$DIR/f0.link1 -LINK2=$DIR/f0.link2 - test_6() { - drop_request "mlink $RENAMED_AGAIN $LINK1" || return 1 - drop_reint_reply "mlink $RENAMED_AGAIN $LINK2" || return 2 -} -run_test 6 "link: drop req, drop rep" + local T=$DIR/$tfile + local LINK1=$DIR/$tfile.link1 + local LINK2=$DIR/$tfile.link2 + + do_facet_create_file client $T 10K || + error_noexit "Create file $T" -[ ! -e $LINK1 ] && mlink $RENAMED_AGAIN $LINK1 -[ ! 
-e $LINK2 ] && mlink $RENAMED_AGAIN $LINK2 -test_7() { - drop_request "munlink $LINK1" || return 1 - drop_reint_reply "munlink $LINK2" || return 2 + drop_request "mlink $T $LINK1" || + error_noexit "mlink request for $T" + + drop_reint_reply "mlink $T $LINK2" || + error_noexit "mlink reply for $T" + + drop_request "munlink $LINK1" || + error_noexit "munlink request for $T" + + drop_reint_reply "munlink $LINK2" || + error_noexit "munlink reply for $T" + + do_facet client "rm $T" || + error_noexit "Can't remove file $T" } -run_test 7 "unlink: drop req, drop rep" +run_test 6 "link, unlink: drop req, drop rep" #bug 1423 test_8() { @@ -95,12 +122,25 @@ run_test 8 "touch: drop rep (bug 1423)" #bug 1420 test_9() { - remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + remote_ost_nodsh && skip "remote OST with nodsh" && return 0 - pause_bulk "cp /etc/profile $DIR/$tfile" || return 1 - do_facet client "cp $SAMPLE_FILE $DIR/${tfile}.2" || return 2 - do_facet client "sync" - do_facet client "rm $DIR/$tfile $DIR/${tfile}.2" || return 3 + local t1=${tfile}.1 + local t2=${tfile}.2 + do_facet_random_file client $TMP/$tfile 1K || + error_noexit "Create random file $TMP/$tfile" + # make this big, else test 9 doesn't wait for bulk -- bz 5595 + do_facet_create_file client $TMP/$t1 4M || + error_noexit "Create file $TMP/$t1" + do_facet client "cp $TMP/$t1 $DIR/$t1" || + error_noexit "Can't copy to $DIR/$t1 file" + pause_bulk "cp $TMP/$tfile $DIR/$tfile" || + error_noexit "Can't pause_bulk copy" + do_facet client "cp $TMP/$t1 $DIR/$t2" || + error_noexit "Can't copy file" + do_facet client "sync" + do_facet client "rm $DIR/$tfile $DIR/$t2 $DIR/$t1" || + error_noexit "Can't remove files" + do_facet client "rm $TMP/$t1 $TMP/$tfile" } run_test 9 "pause bulk on OST (bug 1420)" @@ -120,13 +160,13 @@ run_test 10 "finish request on server after client eviction (bug 1521)" #bug 2460 # wake up a thread waiting for completion after eviction test_11(){ - do_facet client multiop 
$DIR/$tfile Ow || return 1 - do_facet client multiop $DIR/$tfile or || return 2 + do_facet client $MULTIOP $DIR/$tfile Ow || return 1 + do_facet client $MULTIOP $DIR/$tfile or || return 2 cancel_lru_locks osc - do_facet client multiop $DIR/$tfile or || return 3 - drop_bl_callback multiop $DIR/$tfile Ow || echo "evicted as expected" + do_facet client $MULTIOP $DIR/$tfile or || return 3 + drop_bl_callback $MULTIOP $DIR/$tfile Ow || echo "evicted as expected" do_facet client munlink $DIR/$tfile || return 4 } @@ -134,7 +174,7 @@ run_test 11 "wake up a thread waiting for completion after eviction (b=2460)" #b=2494 test_12(){ - $LCTL mark multiop $DIR/$tfile OS_c + $LCTL mark $MULTIOP $DIR/$tfile OS_c do_facet $SINGLEMDS "lctl set_param fail_loc=0x115" clear_failloc $SINGLEMDS $((TIMEOUT * 2)) & multiop_bg_pause $DIR/$tfile OS_c || return 1 @@ -187,22 +227,25 @@ start_read_ahead() { } test_16() { - remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + remote_ost_nodsh && skip "remote OST with nodsh" && return 0 - do_facet client cp $SAMPLE_FILE $DIR - sync - stop_read_ahead + do_facet_random_file client $TMP/$tfile 100K || + { error_noexit "Create random file $TMP/$tfile" ; return 0; } + do_facet client "cp $TMP/$tfile $DIR/$tfile" || + { error_noexit "Copy to $DIR/$tfile file" ; return 0; } + sync + stop_read_ahead #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE - do_facet ost1 "lctl set_param fail_loc=0x80000504" - cancel_lru_locks osc - # OST bulk will time out here, client resends - do_facet client "cmp $SAMPLE_FILE $DIR/${SAMPLE_FILE##*/}" || return 1 - do_facet ost1 lctl set_param fail_loc=0 - # give recovery a chance to finish (shouldn't take long) - sleep $TIMEOUT - do_facet client "cmp $SAMPLE_FILE $DIR/${SAMPLE_FILE##*/}" || return 2 - start_read_ahead + do_facet ost1 "lctl set_param fail_loc=0x80000504" + cancel_lru_locks osc + # OST bulk will time out here, client resends + do_facet client "cmp $TMP/$tfile $DIR/$tfile" || return 1 + do_facet 
ost1 lctl set_param fail_loc=0 + # give recovery a chance to finish (shouldn't take long) + sleep $TIMEOUT + do_facet client "cmp $TMP/$tfile $DIR/$tfile" || return 2 + start_read_ahead } run_test 16 "timeout bulk put, don't evict client (2732)" @@ -211,6 +254,10 @@ test_17() { remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + local SAMPLE_FILE=$TMP/$tfile + do_facet_random_file client $SAMPLE_FILE 20K || + { error_noexit "Create random file $SAMPLE_FILE" ; return 0; } + # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max if at_is_enabled; then at_max_saved=$(at_max_get ost1) @@ -241,6 +288,9 @@ run_test 17 "timeout bulk get, don't evict client (2732)" test_18a() { [ -z ${ost2_svc} ] && skip_env "needs 2 osts" && return 0 + do_facet_create_file client $TMP/$tfile 20K || + { error_noexit "Create file $TMP/$tfile" ; return 0; } + do_facet client mkdir -p $DIR/$tdir f=$DIR/$tdir/$tfile @@ -248,14 +298,14 @@ test_18a() { pgcache_empty || return 1 # 1 stripe on ost2 - lfs setstripe $f -s $((128 * 1024)) -i 1 -c 1 - get_stripe_info client $f + $LFS setstripe -i 1 -c 1 $f + stripe_index=$($LFS getstripe -i $f) if [ $stripe_index -ne 1 ]; then - lfs getstripe $f - error "$f: different stripe offset ($stripe_index)" && return + $LFS getstripe $f + error "$f: stripe_index $stripe_index != 1" && return fi - do_facet client cp $SAMPLE_FILE $f + do_facet client cp $TMP/$tfile $f sync local osc2dev=`lctl get_param -n devices | grep ${ost2_svc}-osc- | egrep -v 'MDT' | awk '{print $1}'` $LCTL --device $osc2dev deactivate || return 3 @@ -272,21 +322,23 @@ run_test 18a "manual ost invalidate clears page cache immediately" test_18b() { remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + do_facet_create_file client $TMP/$tfile 20K || + { error_noexit "Create file $TMP/$tfile" ; return 0; } + do_facet client mkdir -p $DIR/$tdir f=$DIR/$tdir/$tfile cancel_lru_locks osc pgcache_empty || return 1 - # shouldn't have to set stripe size of 
count==1 - lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1 - get_stripe_info client $f + $LFS setstripe -i 0 -c 1 $f + stripe_index=$($LFS getstripe -i $f) if [ $stripe_index -ne 0 ]; then - lfs getstripe $f - error "$f: different stripe offset ($stripe_index)" && return + $LFS getstripe $f + error "$f: stripe_index $stripe_index != 0" && return fi - do_facet client cp $SAMPLE_FILE $f + do_facet client cp $TMP/$tfile $f sync ost_evict_client # allow recovery to complete @@ -303,21 +355,23 @@ run_test 18b "eviction and reconnect clears page cache (2766)" test_18c() { remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + do_facet_create_file client $TMP/$tfile 20K || + { error_noexit "Create file $TMP/$tfile" ; return 0; } + do_facet client mkdir -p $DIR/$tdir f=$DIR/$tdir/$tfile cancel_lru_locks osc pgcache_empty || return 1 - # shouldn't have to set stripe size of count==1 - lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1 - get_stripe_info client $f + $LFS setstripe -i 0 -c 1 $f + stripe_index=$($LFS getstripe -i $f) if [ $stripe_index -ne 0 ]; then - lfs getstripe $f - error "$f: different stripe offset ($stripe_index)" && return + $LFS getstripe $f + error "$f: stripe_index $stripe_index != 0" && return fi - do_facet client cp $SAMPLE_FILE $f + do_facet client cp $TMP/$tfile $f sync ost_evict_client @@ -325,6 +379,7 @@ test_18c() { # lost reply to connect request do_facet ost1 lctl set_param fail_loc=0x80000225 # force reconnect + sleep 1 df $MOUNT > /dev/null 2>&1 sleep 2 # my understanding is that there should be nothing in the page @@ -337,28 +392,43 @@ test_18c() { run_test 18c "Dropped connect reply after eviction handing (14755)" test_19a() { - f=$DIR/$tfile - do_facet client mcreate $f || return 1 - drop_ldlm_cancel "chmod 0777 $f" || echo "evicted as expected" + local BEFORE=`date +%s` + local EVICT - do_facet client checkstat -v -p 0777 $f || echo evicted - # let the client reconnect - sleep 5 - do_facet client "munlink $f" + mount_client $DIR2 + + 
do_facet client mcreate $DIR/$tfile || return 1 + drop_ldlm_cancel "chmod 0777 $DIR2" + + umount_client $DIR2 + do_facet client "munlink $DIR/$tfile" + + # let the client reconnect + sleep 5 + EVICT=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state | \ + awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }') + + [ ! -z "$EVICT" ] && [[ $EVICT -gt $BEFORE ]] || error "no eviction" } run_test 19a "test expired_lock_main on mds (2867)" test_19b() { - f=$DIR/$tfile - do_facet client multiop $f Ow || return 1 - do_facet client multiop $f or || return 2 + local BEFORE=`date +%s` + local EVICT - cancel_lru_locks osc + mount_client $DIR2 - do_facet client multiop $f or || return 3 - drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected" + do_facet client $MULTIOP $DIR/$tfile Ow || return 1 + drop_ldlm_cancel $MULTIOP $DIR2/$tfile Ow + umount_client $DIR2 + do_facet client munlink $DIR/$tfile + + # let the client reconnect + sleep 5 + EVICT=$(do_facet client $LCTL get_param osc.$FSNAME-OST*.state | \ + awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }') - do_facet client munlink $f || return 4 + [ ! -z "$EVICT" ] && [[ $EVICT -gt $BEFORE ]] || error "no eviction" } run_test 19b "test expired_lock_main on ost (2867)" @@ -366,7 +436,7 @@ test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup remote_ost_nodsh && skip "remote OST with nodsh" && return 0 mkdir -p $DIR/$tdir - lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1 + $LFS setstripe -i 0 -c 1 $DIR/$tdir/${tfile} multiop_bg_pause $DIR/$tdir/${tfile} O_wc || return 1 MULTI_PID=$! 
cancel_lru_locks osc @@ -383,7 +453,7 @@ test_20b() { # bug 2986 - ldlm_handle_enqueue error during open remote_ost_nodsh && skip "remote OST with nodsh" && return 0 mkdir -p $DIR/$tdir - lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1 + $LFS setstripe -i 0 -c 1 $DIR/$tdir/${tfile} cancel_lru_locks osc #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 do_facet ost1 lctl set_param fail_loc=0x80000308 @@ -399,7 +469,7 @@ test_21a() { close_pid=$! do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000129" - multiop $DIR/$tdir-2/f Oc & + $MULTIOP $DIR/$tdir-2/f Oc & open_pid=$! sleep 1 do_facet $SINGLEMDS "lctl set_param fail_loc=0" @@ -474,7 +544,7 @@ test_21d() { pid=$! do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000129" - multiop $DIR/$tdir-2/f Oc & + $MULTIOP $DIR/$tdir-2/f Oc & sleep 1 do_facet $SINGLEMDS "lctl set_param fail_loc=0" @@ -591,11 +661,11 @@ test_22() { f2=$DIR/${tfile}-2 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115" - multiop $f2 Oc & + $MULTIOP $f2 Oc & close_pid=$! sleep 1 - multiop $f1 msu || return 1 + $MULTIOP $f1 msu || return 1 cancel_lru_locks mdc do_facet $SINGLEMDS "lctl set_param fail_loc=0" @@ -624,7 +694,7 @@ test_24() { # bug 11710 details correct fsync() behavior remote_ost_nodsh && skip "remote OST with nodsh" && return 0 mkdir -p $DIR/$tdir - lfs setstripe $DIR/$tdir -s 0 -i 0 -c 1 + $LFS setstripe -i 0 -c 1 $DIR/$tdir cancel_lru_locks osc multiop_bg_pause $DIR/$tdir/$tfile Owy_wyc || return 1 MULTI_PID=$! 
@@ -643,7 +713,9 @@ wait_client_evicted () { local exports=$2 local varsvc=${facet}_svc - wait_update $(facet_host $facet) "lctl get_param -n *.${!varsvc}.num_exports | cut -d' ' -f2" $((exports - 1)) $3 + wait_update $(facet_active_host $facet) \ + "lctl get_param -n *.${!varsvc}.num_exports | cut -d' ' -f2" \ + $((exports - 1)) $3 } test_26a() { # was test_26 bug 5921 - evict dead exports by pinger @@ -890,7 +962,7 @@ test_55() { mkdir -p $DIR/$tdir # first dd should be finished quickly - lfs setstripe $DIR/$tdir/$tfile-1 -c 1 -i 0 + $LFS setstripe -c 1 -i 0 $DIR/$tdir/$tfile-1 dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4 & DDPID=$! count=0 @@ -905,7 +977,7 @@ test_55() { done echo "(dd_pid=$DDPID, time=$count)successful" - lfs setstripe $DIR/$tdir/$tfile-2 -c 1 -i 0 + $LFS setstripe -c 1 -i 0 $DIR/$tdir/$tfile-2 #define OBD_FAIL_OST_DROP_REQ 0x21d do_facet ost1 lctl set_param fail_loc=0x0000021d # second dd will be never finished @@ -987,7 +1059,10 @@ test_58() { # bug 11546 lctl set_param fail_loc=0 drop_bl_callback rm -f $DIR/$tfile wait $pid - do_facet client "df $DIR" + # the first 'df' could trigger the eviction caused by + # 'drop_bl_callback', which is the normal case, + # but the next 'df' should return successfully. 
+ do_facet client "df $DIR" || do_facet client "df $DIR" } run_test 58 "Eviction in the middle of open RPC reply processing" @@ -1080,7 +1155,7 @@ test_61() mkdir -p $DIR/$tdir || error "mkdir dir $DIR/$tdir failed" # Set the default stripe of $DIR/$tdir to put the files to ost1 - $LFS setstripe -c 1 --index 0 $DIR/$tdir + $LFS setstripe -c 1 -i 0 $DIR/$tdir replay_barrier $SINGLEMDS createmany -o $DIR/$tdir/$tfile-%d 10 @@ -1089,7 +1164,7 @@ test_61() fail_abort $SINGLEMDS touch $DIR/$tdir/$tfile - local id=`$LFS getstripe $DIR/$tdir/$tfile |awk '($1 ~ 0 && $2 ~ /^[1-9]+/) {print $2}'` + local id=`$LFS getstripe $DIR/$tdir/$tfile | awk '$1 == 0 { print $2 }'` [ $id -le $oid ] && error "the orphan objid was reused, failed" # Cleanup @@ -1097,6 +1172,370 @@ test_61() } run_test 61 "Verify to not reuse orphan objects - bug 17025" +check_cli_ir_state() +{ + local NODE=${1:-$HOSTNAME} + local st # validated below: ON, OFF, ENABLED or DISABLED + st=$(do_node $NODE "lctl get_param mgc.*.ir_state | + awk '/imperative_recovery:/ { print \\\$2}'") + [[ $st == ON || $st == OFF || $st == ENABLED || $st == DISABLED ]] || + error "Error state $st, must be ENABLED or DISABLED" + echo -n $st +} + +check_target_ir_state() +{ + local target=${1} + local name=${target}_svc + local recovery_proc=obdfilter.${!name}.recovery_status + local st # validated below: ON, OFF, ENABLED or DISABLED + + st=$(do_facet $target "lctl get_param -n $recovery_proc | + awk '/IR:/{ print \\\$2}'") + [[ $st == ON || $st == OFF || $st == ENABLED || $st == DISABLED ]] || + error "Error state $st, must be ENABLED or DISABLED" + echo -n $st +} + +set_ir_status() +{ + do_facet mgs lctl set_param -n mgs.MGS.live.$FSNAME="state=$1" +} + +get_ir_status() +{ + local state=$(do_facet mgs "lctl get_param -n mgs.MGS.live.$FSNAME | + awk '/state:/{ print \\\$2 }'") + echo -n ${state/,/} +} + +nidtbl_version_mgs() +{ + local ver=$(do_facet mgs "lctl get_param -n mgs.MGS.live.$FSNAME | + awk '/nidtbl_version:/{ print \\\$2 }'") + echo -n $ver +} + +# nidtbl_version_client <client|mdsN> [node] +nidtbl_version_client() +{ + local 
cli=$1 + local node=${2:-$HOSTNAME} + + if [ X$cli = Xclient ]; then + cli=$FSNAME-client + else + local obdtype=${cli/%[0-9]*/} + [ $obdtype != mds ] && error "wrong parameters $cli" + + node=$(facet_active_host $cli) + local t=${cli}_svc + cli=${!t} + fi + + local vers=$(do_node $node "lctl get_param -n mgc.*.ir_state" | + awk "/$cli/{print \$6}" |sort -u) + + # in case there are multiple mounts on the client node + local arr=($vers) + [ ${#arr[@]} -ne 1 ] && error "versions on client node mismatch" + echo -n $vers +} + +nidtbl_versions_match() +{ + [ $(nidtbl_version_mgs) -eq $(nidtbl_version_client ${1:-client}) ] +} + +target_instance_match() +{ + local srv=$1 + local obdtype obdname + local cliname + + obdtype=${srv/%[0-9]*/} + case $obdtype in + mds) + obdname="mdt" + cliname="mdc" + ;; + ost) + obdname="obdfilter" + cliname="osc" + ;; + *) + error "invalid target type" $srv + return 1 + ;; + esac + + local target=${srv}_svc + local si=$(do_facet $srv lctl get_param -n $obdname.${!target}.instance) + local ci=$(lctl get_param -n $cliname.${!target}-${cliname}-*.import | \ + awk '/instance/{ print $2 }' |head -1) + + [ "$si" -eq "$ci" ] # function status is the comparison; empty values now fail +} + +test_100() +{ + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + { skip "MGS without IR support"; return 0; } + + # MDT was just restarted in the previous test, make sure everything + # is all set. 
+ local cnt=30 + while [ $cnt -gt 0 ]; do + nidtbl_versions_match && break + sleep 1 + cnt=$((cnt - 1)) + done + + # disable IR + set_ir_status disabled + + local prev_ver=$(nidtbl_version_client client) + + local saved_FAILURE_MODE=$FAILURE_MODE + [ $(facet_host mgs) = $(facet_host ost1) ] && FAILURE_MODE="SOFT" + fail ost1 + + # valid check + [ $(nidtbl_version_client client) -eq $prev_ver ] || + error "version must not change due to IR disabled" + target_instance_match ost1 || error "instance mismatch" + + # restore env + set_ir_status full + FAILURE_MODE=$saved_FAILURE_MODE +} +run_test 100 "IR: Make sure normal recovery still works w/o IR" + +test_101() +{ + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + { skip "MGS without IR support"; return 0; } + + set_ir_status full + + local OST1_IMP=$(get_osc_import_name client ost1) + + # disable pinger recovery + lctl set_param -n osc.$OST1_IMP.pinger_recov=0 + + fail ost1 + + target_instance_match ost1 || error "instance mismatch" + nidtbl_versions_match || error "version must match" + + lctl set_param -n osc.$OST1_IMP.pinger_recov=1 +} +run_test 101 "IR: Make sure IR works w/o normal recovery" + +test_102() +{ + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + { skip "MGS without IR support"; return 0; } + + local clients=${CLIENTS:-$HOSTNAME} + local old_version + local new_version + local mgsdev=mgs + + set_ir_status full + + # let's have a new nidtbl version + fail ost1 + + # sleep for a while so that clients can see the failure of ost + # it must be MGC_TIMEOUT_MIN_SECONDS + MGC_TIMEOUT_RAND_CENTISEC. 
+ # in mgc_request.c: + # define MGC_TIMEOUT_MIN_SECONDS 5 + # define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */ + local count=30 # poll up to 30 times, one second apart + while [ $count -gt 0 ]; do + nidtbl_versions_match && break + sleep 1 + count=$((count-1)) + done + + nidtbl_versions_match || error "nidtbl mismatch" + + # get the version # + old_version=$(nidtbl_version_client client) + + zconf_umount_clients $clients $MOUNT || error "Cannot umount client" + + # restart mgs + combined_mgs_mds && mgsdev=mds1 + remount_facet $mgsdev + fail ost1 + + zconf_mount_clients $clients $MOUNT || error "Cannot mount client" + + # check new version + new_version=$(nidtbl_version_client client) + [ $new_version -lt $old_version ] && + error "nidtbl version wrong after mgs restarts" + return 0 +} +run_test 102 "IR: New client gets updated nidtbl after MGS restart" + +test_103() +{ + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + { skip "MGS without IR support"; return 0; } + + combined_mgs_mds && skip "mgs and mds on the same target" && return 0 + + # workaround solution to generate config log on the mds + remount_facet mds1 + + stop mgs + stop mds1 + + # We need this test because mds is like a client in IR context. 
+ start mds1 $MDSDEV1 || error "MDS should start w/o mgs" + + # start mgs and remount mds w/ ir + start mgs $MGSDEV + clients_up + + # remount client so that fsdb will be created on the MGS + umount_client $MOUNT || error "umount failed" + mount_client $MOUNT || error "mount failed" + + # sleep 30 seconds so the MDS has a chance to detect MGS restarting + local count=30 + while [ $count -gt 0 ]; do + [ $(nidtbl_version_client mds1) -ne 0 ] && break + sleep 1 + count=$((count-1)) + done + + # after a while, mds should be able to reconnect to mgs and fetch + # up-to-date nidtbl version + nidtbl_versions_match mds1 || error "mds nidtbl mismatch" + + # reset everything + set_ir_status full +} +run_test 103 "IR: MDS can start w/o MGS and get updated nidtbl later" + +test_104() +{ + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + { skip "MGS without IR support"; return 0; } + + set_ir_status full + + stop ost1 + start ost1 $(ostdevname 1) "$OST_MOUNT_OPTS -onoir" || + error "OST1 cannot start" + clients_up + + local ir_state=$(check_target_ir_state ost1) + [ $ir_state = "DISABLED" -o $ir_state = "OFF" ] || + error "ir status on ost1 should be DISABLED" +} +run_test 104 "IR: ost can disable IR voluntarily" + +test_105() +{ + [ -z "$RCLIENTS" ] && skip "Needs multiple clients" && return 0 + do_facet mgs $LCTL list_param mgs.*.ir_timeout || + { skip "MGS without IR support"; return 0; } + + set_ir_status full + + # get one of the clients from client list + local rcli=$(echo $RCLIENTS |cut -d' ' -f 1) + + local old_MOUNTOPT=$MOUNTOPT + MOUNTOPT=${MOUNTOPT},noir + zconf_umount $rcli $MOUNT || error "umount failed" + zconf_mount $rcli $MOUNT || error "mount failed" + + # make sure lustre mount at $rcli disabling IR + local ir_state=$(check_cli_ir_state $rcli) + [ $ir_state = "DISABLED" -o $ir_state = "OFF" ] || + error "IR state must be DISABLED at $rcli" + + # Since the client just mounted, its last_rcvd entry is not on disk. 
+ # Send an RPC so exp_need_sync forces last_rcvd to commit this export + # so the client can reconnect during OST recovery (LU-924, LU-1582) + $SETSTRIPE -i 0 $DIR/$tfile + dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 conv=sync + + # make sure MGS's state is Partial + [ $(get_ir_status) = "partial" ] || error "MGS IR state must be partial" + + fail ost1 + # make sure IR on ost1 is DISABLED + local ir_state=$(check_target_ir_state ost1) + [ $ir_state = "DISABLED" -o $ir_state = "OFF" ] || + error "IR status on ost1 should be DISABLED" + + # restore it + MOUNTOPT=$old_MOUNTOPT + zconf_umount $rcli $MOUNT || error "umount failed" + zconf_mount $rcli $MOUNT || error "mount failed" + + # make sure MGS's state is full + [ $(get_ir_status) = "full" ] || error "MGS IR status must be full" + + fail ost1 + # make sure IR on ost1 is ENABLED + local ir_state=$(check_target_ir_state ost1) + [ $ir_state = "ENABLED" -o $ir_state = "ON" ] || + error "IR status on ost1 should be ENABLED" + + return 0 +} +run_test 105 "IR: NON IR clients support" + +cleanup_106() { + trap 0 + umount_client $DIR2 +} + +test_106() { # LU-1789 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 + $LCTL set_param fail_loc=0x805 + + trap cleanup_106 EXIT + + # enable lightweight flag on mdc connection + mount_client $DIR2 + + local MDS_NEXP=$(do_facet $SINGLEMDS \ + lctl get_param -n mdt.${mds1_svc}.num_exports | + cut -d' ' -f2) + $LCTL set_param fail_loc=0 + + touch $DIR2/$tfile || error "failed to create empty file" + replay_barrier $SINGLEMDS + facet_failover $SINGLEMDS + + # lightweight connection must be evicted + touch -c $DIR2/$tfile || true + evicted=`dmesg | awk '/test 106/ {start = 1;} + /This client was evicted by .*MDT0000/ { + if (start) { + print; + } + }'` + [ -z "$evicted" ] && error "lightweight client not evicted by mds" + + # and all operations performed by lightweight client should be + # synchronous, so the file created before mds restart should be there + $CHECKSTAT -t file $DIR/$tfile || 
error "file not present" + rm -f $DIR/$tfile + + cleanup_106 +} +run_test 106 "lightweight connection support" + complete $(basename $0) $SECONDS check_and_cleanup_lustre exit_status