init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+# also long tests: 19, 21a, 21e, 21f, 23, 27
+# 1 2.5 2.5 4 4 (min)
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="17 26a 26b 50 51 57"
+
build_test_filter
# Allow us to override the setup if we already have a mounted system by
CLEANUP=${CLEANUP:-""}
cleanup_and_setup_lustre
+rm -rf "${DIR:?}"/[df][0-9]* # :? aborts instead of globbing from / when DIR is unset or empty
+
+SAMPLE_NAME=recovery-small.junk
+SAMPLE_FILE=$TMP/$SAMPLE_NAME
+# make this big, else test 9 doesn't wait for bulk -- bz 5595
+dd if=/dev/urandom of=$SAMPLE_FILE bs=1M count=4 # 4 x 1M blocks of random data
test_1() {
drop_request "mcreate $MOUNT/1" || return 1
#bug 1420
test_9() {
pause_bulk "cp /etc/profile $MOUNT/$tfile" || return 1
- do_facet client "cp /etc/termcap $MOUNT/${tfile}.2" || return 2
+ do_facet client "cp ${SAMPLE_FILE} $MOUNT/${tfile}.2" || return 2 # SAMPLE_FILE is the random-data file created with dd at the top of this script
do_facet client "sync"
do_facet client "rm $MOUNT/$tfile $MOUNT/${tfile}.2" || return 3
}
$LCTL mark multiop $MOUNT/$tfile OS_c
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x115"
clear_failloc $SINGLEMDS $((TIMEOUT * 2)) &
- multiop $MOUNT/$tfile OS_c &
+ multiop_bg_pause $MOUNT/$tfile OS_c || return 1
PID=$!
#define OBD_FAIL_MDS_CLOSE_NET 0x115
- sleep 2
kill -USR1 $PID
echo "waiting for multiop $PID"
wait $PID || return 2
}
run_test 15 "failed open (-ENOMEM)"
-READ_AHEAD=`cat $LPROC/llite/*/max_read_ahead_mb | head -n 1`
+READ_AHEAD=`lctl get_param -n llite.*.max_read_ahead_mb | head -n 1`
stop_read_ahead() {
- for f in $LPROC/llite/*/max_read_ahead_mb; do
- echo 0 > $f
- done
+ lctl set_param -n llite.*.max_read_ahead_mb 0 # one call writes 0 to max_read_ahead_mb of every llite.* entry
}
start_read_ahead() {
- for f in $LPROC/llite/*/max_read_ahead_mb; do
- echo $READ_AHEAD > $f
- done
+ lctl set_param -n llite.*.max_read_ahead_mb $READ_AHEAD # writes back the value READ_AHEAD captured (first llite entry only) when the script was loaded
}
test_16() {
pgcache_empty || return 1
# 1 stripe on ost2
- lfs setstripe $f $((128 * 1024)) 1 1
+ lfs setstripe $f -s $((128 * 1024)) -i 1 -c 1
- do_facet client cp /etc/termcap $f
+ do_facet client cp $SAMPLE_FILE $f
sync
- local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | egrep -v 'MDT' | awk '{print $1}'`
+ local osc2dev=`lctl get_param -n devices | grep ${ost2_svc}-osc- | egrep -v 'MDT' | awk '{print $1}'`
$LCTL --device $osc2dev deactivate || return 3
# my understanding is that there should be nothing in the page
# cache after the client reconnects?
pgcache_empty || return 1
# shouldn't have to set stripe size of count==1
- lfs setstripe $f $((128 * 1024)) 0 1
- lfs setstripe $f2 $((128 * 1024)) 0 1
+ lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1
+ lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1
- do_facet client cp /etc/termcap $f
+ do_facet client cp $SAMPLE_FILE $f
sync
ost_evict_client
# allow recovery to complete
}
run_test 18b "eviction and reconnect clears page cache (2766)"
+test_18c() { # page cache check after an eviction whose reconnect reply is dropped; 14755 is the number cited in the run_test description
+ do_facet client mkdir -p $MOUNT/$tdir
+ f=$MOUNT/$tdir/$tfile
+ f2=$MOUNT/$tdir/${tfile}-2
+
+ cancel_lru_locks osc
+ pgcache_empty || return 1 # precondition: give up with status 1 if pgcache_empty already fails here
+
+ # shouldn't have to set stripe size of count==1
+ lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1 # -s 131072 bytes, -i 0, -c 1 (size, starting index, count per lfs setstripe usage)
+ lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1
+
+ do_facet client cp $SAMPLE_FILE $f
+ sync
+ ost_evict_client
+
+ # OBD_FAIL_OST_CONNECT_NET2
+ # lost reply to connect request
+ do_facet ost1 sysctl -w lustre.fail_loc=0x80000225
+ # force reconnect
+ df $MOUNT > /dev/null 2>&1
+ sleep 2
+ # my understanding is that there should be nothing in the page
+ # cache after the client reconnects?
+ rc=0
+ pgcache_empty || rc=2 # record the failure instead of returning so the rm below still runs
+ rm -f $f $f2
+ return $rc # 0 on success, 2 when pgcache_empty failed after the reconnect
+}
+run_test 18c "Dropped connect reply after eviction handing (14755)"
+
test_19a() {
f=$MOUNT/$tfile
do_facet client mcreate $f || return 1
test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup
mkdir -p $DIR/$tdir
- multiop $DIR/$tdir/${tfile} O_wc &
+ multiop_bg_pause $DIR/$tdir/${tfile} O_wc || return 1
MULTI_PID=$!
- sleep 1
cancel_lru_locks osc
#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
do_facet ost1 sysctl -w lustre.fail_loc=0x80000308
test_21a() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
close_pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000129"
test_21b() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
close_pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107"
test_21c() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
close_pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107"
test_21d() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000129"
test_21e() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000119"
test_21f() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000119"
test_21g() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000119"
test_21h() {
mkdir -p $DIR/$tdir-1
mkdir -p $DIR/$tdir-2
- multiop $DIR/$tdir-1/f O_c &
+ multiop_bg_pause $DIR/$tdir-1/f O_c || return 1
pid=$!
do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107"
run_test 22 "drop close request and do mknod"
test_23() { #b=4561
- multiop $DIR/$tfile O_c &
+ multiop_bg_pause $DIR/$tfile O_c || return 1
pid=$!
# give a chance for open
sleep 5
test_24() { # bug 2248 - eviction fails writeback but app doesn't see it
mkdir -p $DIR/$tdir
cancel_lru_locks osc
- multiop $DIR/$tdir/$tfile Owy_wyc &
+ multiop_bg_pause $DIR/$tdir/$tfile Owy_wyc || return 1 # NOTE(review): presumably returns only once multiop has reached its pause point, replacing the usleep delays - confirm in the test framework
MULTI_PID=$!
- usleep 500
ost_evict_client
- usleep 500
kill -USR1 $MULTI_PID
wait $MULTI_PID
rc=$?
sysctl -w lustre.fail_loc=0x0
client_reconnect
- [ $rc -eq 0 ] && error "multiop didn't fail fsync: rc $rc" || true
+ [ $rc -eq 0 ] && error_ignore 5494 "multiop didn't fail fsync: rc $rc" || true # a clean multiop exit is the unexpected case; it is reported through error_ignore, and the trailing true keeps the function status 0
}
run_test 24 "fsync error (should return error)"
-test_26() { # bug 5921 - evict dead exports by pinger
+test_26a() { # was test_26 bug 5921 - evict dead exports by pinger
# this test can only run from a client on a separate node.
- remote_ost || skip "local OST" && return
- remote_mds || skip "local MDS" && return
- OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
- OST_EXP="`do_facet ost1 cat $OST_FILE`"
+ remote_ost || { skip "local OST" && return 0; } # braces limit the return to the case where remote_ost fails
+ remote_mds || { skip "local MDS" && return 0; } # same grouping for the MDS check
+ OST_FILE=obdfilter.${ost1_svc}.num_exports # dotted lctl parameter name, used with get_param below
+ OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
echo starting with $OST_NEXP1 OST exports
# OBD_FAIL_PTLRPC_DROP_RPC 0x505
# might have to wait for the next ping.
echo Waiting for $(($TIMEOUT * 4)) secs
sleep $(($TIMEOUT * 4))
- OST_EXP="`do_facet ost1 cat $OST_FILE`"
+ OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`" # second sample, taken after the wait, for comparison with OST_NEXP1
OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
echo ending with $OST_NEXP2 OST exports
do_facet client sysctl -w lustre.fail_loc=0x0
[ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted"
return 0
}
-run_test 26 "evict dead exports"
+run_test 26a "evict dead exports"
test_26b() { # bug 10140 - evict dead exports by pinger
client_df
zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
- MDS_FILE=$LPROC/mdt/${mds1_svc}/num_exports
- MDS_NEXP1="`do_facet $SINGLEMDS cat $MDS_FILE | cut -d' ' -f2`"
- OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports
- OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
+ sleep 1 # wait for the connections to be established
+ MDS_FILE=mdt.${mds1_svc}.num_exports
+ MDS_NEXP1="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
+ OST_FILE=obdfilter.${ost1_svc}.num_exports
+ OST_NEXP1="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
zconf_umount `hostname` $MOUNT2 -f
# evictor takes up to 2.25x to evict. But if there's a
# might have to wait for the next ping.
echo Waiting for $(($TIMEOUT * 4)) secs
sleep $(($TIMEOUT * 4))
- OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`"
- MDS_NEXP2="`do_facet $SINGLEMDS cat $MDS_FILE | cut -d' ' -f2`"
+ OST_NEXP2="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
+ MDS_NEXP2="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
[ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
[ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
rc=$?
echo writemany returned $rc
#these may fail because of eviction due to slow AST response.
- return $rc
+ [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true
}
run_test 50 "failover MDS under load"
wait $CLIENT_PID
rc=$?
echo writemany returned $rc
- return $rc
+ [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true
}
run_test 51 "failover MDS during recovery"
# test of open reconstruct
test_53() {
touch $DIR/$tfile
- drop_ldlm_reply "./openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tfile" ||\
+ drop_ldlm_reply "openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tfile" ||\
return 2
}
run_test 53 "touch: drop rep"
test_57_helper() {
# no oscs means no client or mdt
- while [ -e $LPROC/osc ]; do
- for f in `find $LPROC -type f`; do
- cat $f > /dev/null 2>&1
- done
+ while lctl get_param osc.*.* > /dev/null 2>&1; do
+ : # no-op body: the loop condition itself keeps reading osc.*.* until lctl get_param fails (proc file removed)
done
}
}
run_test 58 "Eviction in the middle of open RPC reply processing"
+test_59() { # bug 10589
+ zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2" # mount MOUNT2 on the local host
+ echo $DIR2 | grep -q $MOUNT2 || error "DIR2 is not set properly: $DIR2" # sanity check: DIR2 must contain the MOUNT2 path
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311
+ sysctl -w lustre.fail_loc=0x311 # set locally (no do_facet) for the duration of the write
+ writes=$(LANG=C dd if=/dev/zero of=$DIR2/$tfile count=1 2>&1) # one dd block written through DIR2; the dd statistics on stderr are captured
+ [ $? = 0 ] || error "dd write failed"
+ writes=$(echo $writes | awk -F '+' '/out/ {print $1}') # unquoted echo folds the dd output onto one line; keep the count before the first +
+ sysctl -w lustre.fail_loc=0 # clear the fail_loc again
+ sync
+ zconf_umount `hostname` $MOUNT2 -f # unmount MOUNT2 with -f before reading back
+ reads=$(LANG=C dd if=$DIR/$tfile of=/dev/null 2>&1) # read the same file name back through DIR
+ [ $? = 0 ] || error "dd read failed"
+ reads=$(echo $reads | awk -F '+' '/in/ {print $1}') # same extraction as for the write
+ [ "$reads" -eq "$writes" ] || error "read" $reads "blocks, must be" $writes # the read count must equal the write count
+}
+run_test 59 "Read cancel race on client eviction"
+
equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true