run_test 0 "empty replay"
test_0b() {
- # this test attempts to trigger a race in the precreation code,
+ # this test attempts to trigger a race in the precreation code,
# and must run before any other objects are created on the filesystem
fail ost1
createmany -o $DIR/$tfile 20 || return 1
{
local mds=$1
local width=$2
- local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width`
+ local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width`
echo $width > $file
}
# Print the current FID sequence width for the MDS with the given label,
# read from the seq client's "width" proc file (companion to the setter above).
seq_get_width()
{
local mds=$1
- local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width`
+ local file=`ls /proc/fs/lustre/seq/cli-srv-$mds-mdc-*/width`
cat $file
}
# (1) fld_create replay should happen;
#
# (2) fld_create replay should not return -EEXISTS, if it does
-# this means sequence manager recovery code is buggy and allocated
+# this means sequence manager recovery code is buggy and allocated
# same sequence two times after recovery.
#
# multi-mds
# ---------
-# (1) fld_create replay may not happen, because its home MDS is
+# (1) fld_create replay may not happen, because its home MDS is
# MDS2 which is not involved in recovery;
#
-# (2) as fld_create does not happen on MDS1, it does not make any
+# (2) as fld_create does not happen on MDS1, it does not make any
# problem.
test_0c() {
local label=`mdsdevlabel 1`
replay_barrier $SINGLEMDS
local sw=`seq_get_width $label`
-
- # make seq manager switch to next sequence each
+
+ # make seq manager switch to next sequence each
# time as new fid is needed.
seq_set_width $label 1
-
- # make sure that fld has created at least one new
+
+ # make sure that fld has created at least one new
# entry on server
touch $DIR/$tfile || return 2
seq_set_width $label $sw
-
+
# fail $SINGLEMDS and start recovery, replay RPCs, etc.
fail $SINGLEMDS
-
+
# wait for recovery finish
sleep 10
df $MOUNT
-
- # flush fld cache and dentry cache to make it lookup
+
+ # flush fld cache and dentry cache to make it lookup
# created entry instead of revalidating existent one
umount $MOUNT
zconf_mount `hostname` $MOUNT
-
- # issue lookup which should call fld lookup which
- # should fail if client did not replay fld create
+
+ # issue lookup which should call fld lookup which
+ # should fail if client did not replay fld create
# correctly and server has no fld entry
touch $DIR/$tfile || return 3
rm $DIR/$tfile || return 4
replay_barrier $SINGLEMDS
for i in `seq 10`; do
echo "tag-$i" > $DIR/$tfile-$i
- done
+ done
fail $SINGLEMDS
for i in `seq 10`; do
grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
- done
+ done
}
run_test 4 "|x| 10 open(O_CREAT)s"
}
run_test 4b "|x| rm 10 files"
-# The idea is to get past the first block of precreated files on both
+# The idea is to get past the first block of precreated files on both
# osts, and then replay.
# Create 220 tagged files (past the first precreated-object block on both
# OSTs), fail the MDS, and verify every file replays with its tag intact.
# NOTE(review): the tail of this body (rm -rf $DIR/$tdir / CHECKSTAT -t dir)
# looks spliced in from the rmdir test (cf. "run_test 6b" just below) — this
# appears to be a mangled patch hunk; verify against the upstream script.
test_5() {
replay_barrier $SINGLEMDS
for i in `seq 220`; do
echo "tag-$i" > $DIR/$tfile-$i
- done
+ done
fail $SINGLEMDS
for i in `seq 220`; do
grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
- done
+ done
rm -rf $DIR/$tfile-*
sleep 3
# waiting for commitment of removal
replay_barrier $SINGLEMDS
rm -rf $DIR/$tdir
fail $SINGLEMDS
# directory must be gone after replay of the rm -rf
- $CHECKSTAT -t dir $DIR/$tdir && return 1 || true
+ $CHECKSTAT -t dir $DIR/$tdir && return 1 || true
}
run_test 6b "|X| rmdir"
mv $DIR/$tfile $DIR/$tfile-2
replay_barrier $SINGLEMDS
echo "new" > $DIR/$tfile
- grep new $DIR/$tfile
+ grep new $DIR/$tfile
grep old $DIR/$tfile-2
fail $SINGLEMDS
grep new $DIR/$tfile || return 1
run_test 11 "create open write rename |X| create-old-name read"
test_12() {
- mcreate $DIR/$tfile
+ mcreate $DIR/$tfile
multiop $DIR/$tfile o_tSc &
pid=$!
# give multiop a chance to open
# 1777 - replay open after committed chmod that would make
-# a regular open a failure
+# a regular open a failure
test_13() {
- mcreate $DIR/$tfile
+ mcreate $DIR/$tfile
multiop $DIR/$tfile O_wc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
chmod 0 $DIR/$tfile
$CHECKSTAT -p 0 $DIR/$tfile
replay_barrier $SINGLEMDS
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
replay_barrier $SINGLEMDS
kill -USR1 $pid || return 1
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
replay_barrier $SINGLEMDS
touch $DIR/g11 || return 1
multiop $DIR/$tfile O_c &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
fail $SINGLEMDS
kill -USR1 $pid || return 1
wait $pid || return 2
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
touch $DIR/$tfile-2 || return 1
echo "pid: $pid will close"
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
fail $SINGLEMDS
df -P $DIR || df -P $DIR || true # reconnect
wait_mds_recovery_done || error "MDS recovery not done"
- # FIXME just because recovery is done doesn't mean we've finished
+ # FIXME just because recovery is done doesn't mean we've finished
# orphan cleanup. Fake it with a sleep for now...
sleep 10
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
touch $DIR/g11 || return 1
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
replay_barrier $SINGLEMDS
rm -f $DIR/$tfile
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
replay_barrier $SINGLEMDS
rm -f $DIR/$tfile
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
replay_barrier $SINGLEMDS
fail $SINGLEMDS
multiop $DIR/$tfile O_tSc &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
replay_barrier $SINGLEMDS
multiop $DIR/$tfile-2 O_tSc &
pid2=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile-1
rm -f $DIR/$tfile-2
kill -USR1 $pid2
multiop $DIR/$tfile-2 O_tSc &
pid2=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile-1
rm -f $DIR/$tfile-2
multiop $DIR/$tfile-2 O_tSc &
pid2=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
replay_barrier $SINGLEMDS
rm -f $DIR/$tfile-1
rm -f $DIR/$tfile-2
multiop $DIR/$tfile-2 O_tSc &
pid2=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
replay_barrier $SINGLEMDS
rm -f $DIR/$tfile-1
rm -f $DIR/$tfile-2
multiop $DIR/$tfile-2 O_tSc &
pid2=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile-1
rm -f $DIR/$tfile-2
multiop $DIR/$tfile-2 O_tSc &
pid2=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile-1
replay_barrier $SINGLEMDS
createmany -o $DIR/$tfile-%d 100
fail_abort $SINGLEMDS
# this file should be gone, because the replay was aborted
- $CHECKSTAT -t file $DIR/$tfile-* && return 3
+ $CHECKSTAT -t file $DIR/$tfile-* && return 3
unlinkmany $DIR/$tfile-%d 0 100
return 0
}
run_test 33 "abort recovery before client does replay"
-# Stale FID sequence
+# Stale FID sequence
# Stale FID sequence scenario: create files behind a replay barrier, then
# hold a file open and unlink it so an orphan exists at recovery time.
# NOTE(review): this body ends right after the second replay_barrier with no
# fail/wait/cleanup — hunk looks truncated by the patch; verify upstream.
test_33a() {
replay_barrier $SINGLEMDS
createmany -o $DIR/$tfile-%d 10
multiop $DIR/$tfile O_c &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rm -f $DIR/$tfile
replay_barrier $SINGLEMDS
}
run_test 34 "abort recovery before client does replay (test mds_cleanup_orphans)"
-# bug 2278 - generate one orphan on OST, then destroy it during recovery from llog
+# bug 2278 - generate one orphan on OST, then destroy it during recovery from llog
test_35() {
touch $DIR/$tfile
checkstat $DIR/$tfile
facet_failover $SINGLEMDS
cancel_lru_locks mdc
- if dmesg | grep "unknown lock cookie"; then
+ if dmesg | grep "unknown lock cookie"; then
echo "cancel after replay failed"
return 1
fi
multiop $DIR/$tfile dD_c &
pid=$!
# give multiop a chance to open
- sleep 1
+ sleep 1
rmdir $DIR/$tfile
replay_barrier $SINGLEMDS
#b=2477,2532
test_40(){
- $LCTL mark multiop $MOUNT/$tfile OS_c
+ $LCTL mark multiop $MOUNT/$tfile OS_c
multiop $MOUNT/$tfile OS_c &
PID=$!
writeme -s $MOUNT/${tfile}-2 &
sleep $TIMEOUT
stat2=`count_ost_writes`
echo "$stat1, $stat2"
- if [ $stat1 -lt $stat2 ]; then
+ if [ $stat1 -lt $stat2 ]; then
echo "writes continuing during recovery"
RC=0
else
fi
echo "waiting for writeme $WRITE_PID"
kill $WRITE_PID
- wait $WRITE_PID
+ wait $WRITE_PID
echo "waiting for multiop $PID"
wait $PID || return 2
local f=$MOUNT/$tfile
# make sure the start of the file is ost1
- lfs setstripe $f -s $((128 * 1024)) -i 0
+ lfs setstripe $f -s $((128 * 1024)) -i 0
do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3
cancel_lru_locks osc
# fail ost2 and read from ost1
debugsave
sysctl -w lnet.debug=-1
facet_failover ost1
-
+
# osc is evicted, fs is smaller (but only with failout OSTs (bug 7287)
#blocks_after=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
#[ $blocks_after -lt $blocks ] || return 1
pid=$!
sleep 1
- # This will cause the CLOSE to fail before even
+ # This will cause the CLOSE to fail before even
# allocating a reply buffer
$LCTL --device $mdcdev deactivate || return 4
run_test 46 "Don't leak file handle after open resend (3325)"
# bug 2824: after an OST failure the MDS must be able to recover its
# precreate state; without the fix the second createmany would hang.
# NOTE(review): the step that actually fails the OST appears to be missing
# between the "df" and the "sleep" — likely lost in this patch hunk; confirm.
test_47() { # bug 2824
- # create some files to make sure precreate has been done on all
+ # create some files to make sure precreate has been done on all
# OSTs. (just in case this test is run independently)
createmany -o $DIR/$tfile 20 || return 1
df $MOUNT || return 2
# let the MDS discover the OST failure, attempt to recover, fail
- # and recover again.
+ # and recover again.
sleep $((3 * TIMEOUT))
- # Without 2824, this createmany would hang
+ # Without 2824, this createmany would hang
createmany -o $DIR/$tfile 20 || return 3
unlinkmany $DIR/$tfile 20 || return 4
}
run_test 52 "time out lock replay (3764)"
-#b_cray 53 "|X| open request and close reply while two MDC requests in flight"
+# bug 3462 - simultaneous MDC requests
+# Drop the close request on the wire (OBD_FAIL_MDS_CLOSE_NET, one-shot via
+# 0x80000000 flag) while a create in a second dir completes, then fail the
+# MDS; after replay both files must exist and the close must complete.
+test_53a() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+ # give multiop a chance to open
+ sleep 1
+
+ #define OBD_FAIL_MDS_CLOSE_NET 0x115
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000115"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+
+ mcreate $DIR/${tdir}-2/f || return 1
+
+ # close should still be here
+ [ -d /proc/$close_pid ] || return 2
+
+ replay_barrier_nodf $SINGLEMDS
+ fail $SINGLEMDS
+ wait $close_pid || return 3
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53a "|X| close request while two MDC requests in flight"
+
+# Drop the open (reint) request on the wire (OBD_FAIL_MDS_REINT_NET) while a
+# close completes, then fail the MDS; the resent open must succeed after
+# recovery and both files must exist.
+test_53b() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+
+ #define OBD_FAIL_MDS_REINT_NET 0x107
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107"
+ mcreate $DIR/${tdir}-2/f &
+ open_pid=$!
+ sleep 1
+
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+ wait $close_pid || return 1
+ # open should still be here
+ [ -d /proc/$open_pid ] || return 2
+
+ replay_barrier_nodf $SINGLEMDS
+ fail $SINGLEMDS
+ wait $open_pid || return 3
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53b "|X| open request while two MDC requests in flight"
+
+# Drop BOTH the open request (OBD_FAIL_MDS_REINT_NET) and the close request
+# (OBD_FAIL_MDS_CLOSE_NET) while the two RPCs are in flight, then fail the
+# MDS without waiting for df (fail_nodf); the open must be resent and
+# succeed, and the close process must be gone.
+test_53c() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+
+ #define OBD_FAIL_MDS_REINT_NET 0x107
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107"
+ mcreate $DIR/${tdir}-2/f &
+ open_pid=$!
+ sleep 1
+
+ #define OBD_FAIL_MDS_CLOSE_NET 0x115
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000115"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+
+ replay_barrier_nodf $SINGLEMDS
+ fail_nodf $SINGLEMDS
+ wait $open_pid || return 1
+ sleep 2
+ # close should be gone
+ [ -d /proc/$close_pid ] && return 2
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53c "|X| open request and close request while two MDC requests in flight"
+
+# Drop the close REPLY (OBD_FAIL_MDS_CLOSE_NET_REP) — the request reaches the
+# MDS but the client never sees the answer — then fail the MDS; the close
+# must be replayed/resent and complete, and both files must exist.
+test_53d() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+ # give multiop a chance to open
+ sleep 1
+
+ #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x8000013f"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+ mcreate $DIR/${tdir}-2/f || return 1
+
+ # close should still be here
+ [ -d /proc/$close_pid ] || return 2
+ fail $SINGLEMDS
+ wait $close_pid || return 3
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53d "|X| close reply while two MDC requests in flight"
+
+# Drop the open REPLY (OBD_FAIL_MDS_REINT_NET_REP) while a close completes,
+# then fail the MDS; the open must be resent and succeed after recovery.
+# NOTE(review): fail_loc here is 0x119 without the 0x80000000 one-shot flag
+# that every other 53* subtest uses, so the failure point fires repeatedly
+# until cleared — confirm this is intended and not a missing 0x80000119.
+test_53e() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+
+ #define OBD_FAIL_MDS_REINT_NET_REP 0x119
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x119"
+ mcreate $DIR/${tdir}-2/f &
+ open_pid=$!
+ sleep 1
+
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+ wait $close_pid || return 1
+ # open should still be here
+ [ -d /proc/$open_pid ] || return 2
+
+ replay_barrier_nodf $SINGLEMDS
+ fail $SINGLEMDS
+ wait $open_pid || return 3
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53e "|X| open reply while two MDC requests in flight"
+
+# Drop BOTH the open reply (OBD_FAIL_MDS_REINT_NET_REP) and the close reply
+# (OBD_FAIL_MDS_CLOSE_NET_REP), then fail the MDS (fail_nodf); the open must
+# be resent and succeed, and the close process must be gone.
+# NOTE(review): 0x119 lacks the 0x80000000 one-shot flag used elsewhere in
+# this series (cf. 0x8000013f below) — confirm intended.
+test_53f() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+
+ #define OBD_FAIL_MDS_REINT_NET_REP 0x119
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x119"
+ mcreate $DIR/${tdir}-2/f &
+ open_pid=$!
+ sleep 1
+
+ #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x8000013f"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+
+ replay_barrier_nodf $SINGLEMDS
+ fail_nodf $SINGLEMDS
+ wait $open_pid || return 1
+ sleep 2
+ # close should be gone
+ [ -d /proc/$close_pid ] && return 2
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53f "|X| open reply and close reply while two MDC requests in flight"
+
+# Drop the open reply (OBD_FAIL_MDS_REINT_NET_REP) and the close REQUEST
+# (OBD_FAIL_MDS_CLOSE_NET) while both are in flight, clear the fail_loc
+# before failover, then fail the MDS (fail_nodf); open resends and succeeds,
+# close process must be gone.
+# NOTE(review): 0x119 lacks the 0x80000000 one-shot flag used by the other
+# fail_locs in this series — confirm intended.
+test_53g() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+
+ #define OBD_FAIL_MDS_REINT_NET_REP 0x119
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x119"
+ mcreate $DIR/${tdir}-2/f &
+ open_pid=$!
+ sleep 1
+
+ #define OBD_FAIL_MDS_CLOSE_NET 0x115
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000115"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+ replay_barrier_nodf $SINGLEMDS
+ fail_nodf $SINGLEMDS
+ wait $open_pid || return 1
+ sleep 2
+ # close should be gone
+ [ -d /proc/$close_pid ] && return 2
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53g "|X| drop open reply and close request while close and open are both in flight"
+
+# Drop the open REQUEST (OBD_FAIL_MDS_REINT_NET) and the close REPLY
+# (OBD_FAIL_MDS_CLOSE_NET_REP) while both RPCs are in flight, then fail the
+# MDS (fail_nodf); the open must be resent and succeed, and the close
+# process must be gone after recovery.
+test_53h() {
+ mkdir -p $DIR/${tdir}-1
+ mkdir -p $DIR/${tdir}-2
+ multiop $DIR/${tdir}-1/f O_c &
+ close_pid=$!
+
+ #define OBD_FAIL_MDS_REINT_NET 0x107
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x80000107"
+ mcreate $DIR/${tdir}-2/f &
+ open_pid=$!
+ sleep 1
+
+ #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0x8000013f"
+ kill -USR1 $close_pid
+ cancel_lru_locks mdc # force the close
+ sleep 1
+
+ replay_barrier_nodf $SINGLEMDS
+ fail_nodf $SINGLEMDS
+ wait $open_pid || return 1
+ sleep 2
+ # close should be gone
+ [ -d /proc/$close_pid ] && return 2
+ do_facet $SINGLEMDS "sysctl -w lustre.fail_loc=0"
+
+ $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
+ $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
+ rm -rf $DIR/${tdir}-*
+}
+run_test 53h "|X| open request and close reply while two MDC requests in flight"
+
#b_cray 54 "|X| open request and close reply while two MDC requests in flight"
#b3761 ASSERTION(hash != 0) failed
fail $SINGLEMDS
unlinkmany $DIR/$tdir/$tfile-%d 100 100
local no_ctxt=`dmesg | grep "No ctxt"`
- [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
+ [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
}
run_test 60 "test llog post recovery init vs llog unlink"