# This test needs to be run on the client
#
SAVE_PWD=$PWD
+export MULTIOP=${MULTIOP:-multiop}
LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
SETUP=${SETUP:-}
CLEANUP=${CLEANUP:-}
. $LUSTRE/tests/test-framework.sh
init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
CHECK_GRANT=${CHECK_GRANT:-"yes"}
GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
-remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
+require_dsh_mds || exit 0
# Skip these tests
-# bug number: 17466 15962
-ALWAYS_EXCEPT="61d $REPLAY_SINGLE_EXCEPT"
-
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
- CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
- echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
- echo "Except the tests: $CONFIG_EXCEPTIONS"
- ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
+# bug number: 17466 18857 LU-1867 LU-1473
+ALWAYS_EXCEPT="61d 33a 33b 89 62 $REPLAY_SINGLE_EXCEPT"
+
+[ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
+# bug number for skipped test: LU-951
+ ALWAYS_EXCEPT="$ALWAYS_EXCEPT 73a"
# 63 min 7 min AT AT AT AT"
[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68"
+[ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
+# bug number for skipped test: LU-3127
+ ALWAYS_EXCEPT="$ALWAYS_EXCEPT 73b"
+
build_test_filter
check_and_setup_lustre
mkdir -p $DIR
assert_DIR
-rm -rf $DIR/[df][0-9]*
+rm -rf $DIR/[df][0-9]* $DIR/f.$TESTSUITE.*
+
+# LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
+if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
+ sync
+ do_facet $SINGLEMDS sync
+fi
test_0a() { # was test_0
- sleep 10
mkdir $DIR/$tfile
replay_barrier $SINGLEMDS
fail $SINGLEMDS
}
run_test 0b "ensure object created after recover exists. (3284)"
-seq_set_width()
-{
- local mds=$1
- local width=$2
- lctl set_param -n seq.cli-srv-$mds-mdc-*.width=$width
-}
-
-seq_get_width()
-{
- local mds=$1
- lctl get_param -n seq.cli-srv-$mds-mdc-*.width
-}
-
-# This test should pass for single-mds and multi-mds configs.
-# But for different configurations it tests different things.
-#
-# single-mds
-# ----------
-# (1) fld_create replay should happen;
-#
-# (2) fld_create replay should not return -EEXISTS, if it does
-# this means sequence manager recovery code is buggy and allocated
-# same sequence two times after recovery.
-#
-# multi-mds
-# ---------
-# (1) fld_create replay may not happen, because its home MDS is
-# MDS2 which is not involved to revovery;
-#
-# (2) as fld_create does not happen on MDS1, it does not make any
-# problem.
test_0c() {
- local label=`mdsdevlabel 1`
- [ -z "$label" ] && echo "No label for mds1" && return 1
-
+ replay_barrier $SINGLEMDS
+ mcreate $DIR/$tfile
+ umount $MOUNT
+ facet_failover $SINGLEMDS
+ zconf_mount `hostname` $MOUNT || error "mount fails"
+ client_up || error "post-failover df failed"
+ # file shouldn't exist if replay-barrier works as expected
+ rm $DIR/$tfile && error "File exists and it shouldn't"
+ return 0
+}
+run_test 0c "check replay-barrier"
+
+test_0d() {
replay_barrier $SINGLEMDS
- local sw=`seq_get_width $label`
-
- # make seq manager switch to next sequence each
- # time as new fid is needed.
- seq_set_width $label 1
-
- # make sure that fld has created at least one new
- # entry on server
- touch $DIR/$tfile || return 2
- seq_set_width $label $sw
-
- # fail $SINGLEMDS and start recovery, replay RPCs, etc.
- fail $SINGLEMDS
-
- # wait for recovery finish
- sleep 10
- df $MOUNT
-
- # flush fld cache and dentry cache to make it lookup
- # created entry instead of revalidating existent one
umount $MOUNT
- zconf_mount `hostname` $MOUNT
-
- # issue lookup which should call fld lookup which
- # should fail if client did not replay fld create
- # correctly and server has no fld entry
- touch $DIR/$tfile || return 3
- rm $DIR/$tfile || return 4
+ facet_failover $SINGLEMDS
+ zconf_mount `hostname` $MOUNT || error "mount fails"
+ client_up || error "post-failover df failed"
}
-run_test 0c "fld create"
+run_test 0d "expired recovery with no clients"
test_1() {
replay_barrier $SINGLEMDS
wait $pid || return 1
$CHECKSTAT -s 1 -p 0 $DIR/$tfile || return 2
+ rm $DIR/$tfile || return 4
return 0
}
run_test 13 "open chmod 0 |x| write close"
usleep 60 # give dd a chance to start
done
- lfs getstripe $DIR/$tfile || return 1
+ $GETSTRIPE $DIR/$tfile || return 1
rm -f $DIR/$tfile || return 2 # make it an orphan
mds_evict_client
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
fail $SINGLEMDS # start orphan recovery
- df -P $DIR || df -P $DIR || true # reconnect
wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
-
- # just because recovery is done doesn't mean we've finished
- # orphan cleanup. Wait for llogs to get synchronized.
- echo waiting for orphan cleanup...
- while [ true ]; do
- local -a sync=($(do_facet ost "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}'))
- local con=1
- for ((i=0; i<${#sync[@]}; i++)); do
- [ ${sync[$i]} -eq 0 ] && continue
- # there is a not finished MDS-OST synchronization
- con=0
- break;
- done
- [ ${con} -eq 1 ] && break
- sleep 1
- done
-
- # let the statfs cache to get old enough.
- sleep 1
-
+ wait_mds_ost_sync || return 3
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
- [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
+ (( $AFTERUSED > $BEFOREUSED + $(fs_log_size) )) &&
error "after $AFTERUSED > before $BEFOREUSED"
return 0
}
ls -la $DIR/$tfile
mds_evict_client
-
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
kill -USR1 $pid
- test -s $DIR/$tfile || error "File was truncated"
-
wait $pid || return 1
+ [ -s $DIR/$tfile ] || error "File was truncated"
+
return 0
}
run_test 20c "check that client eviction does not affect file content"
multiop_bg_pause $DIR/$tfile O_c || return 3
pid2=$!
mds_evict_client
- df $MOUNT || sleep 1 && df $MOUNT || return 1
+ client_up || client_up || return 1
kill -USR1 $pid1
kill -USR1 $pid2
wait $pid1 || return 4
}
run_test 32 "close() notices client eviction; close() after client eviction"
-# Abort recovery before client complete
-test_33a() { # was test_33
- replay_barrier $SINGLEMDS
- createmany -o $DIR/$tfile-%d 100
+test_33a() {
+ createmany -o $DIR/$tfile-%d 10
+ replay_barrier_nosync $SINGLEMDS
fail_abort $SINGLEMDS
- # this file should be gone, because the replay was aborted
- $CHECKSTAT -t file $DIR/$tfile-* && return 3
- unlinkmany $DIR/$tfile-%d 0 100
+ # recreate shouldn't fail
+ createmany -o $DIR/$tfile--%d 10 || return 1
+ rm $DIR/$tfile-* -f
return 0
}
-run_test 33a "abort recovery before client does replay"
+run_test 33a "fid seq shouldn't be reused after abort recovery"
+
+test_33b() {
+ #define OBD_FAIL_SEQ_ALLOC 0x1311
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x1311"
-# Stale FID sequence bug 15962
-test_33b() { # was test_33a
- replay_barrier $SINGLEMDS
createmany -o $DIR/$tfile-%d 10
+ replay_barrier_nosync $SINGLEMDS
fail_abort $SINGLEMDS
- unlinkmany $DIR/$tfile-%d 0 10
# recreate shouldn't fail
- createmany -o $DIR/$tfile-%d 10 || return 3
- unlinkmany $DIR/$tfile-%d 0 10
+ createmany -o $DIR/$tfile--%d 10 || return 1
+ rm $DIR/$tfile-* -f
return 0
}
-run_test 33b "fid shouldn't be reused after abort recovery"
+run_test 33b "test fid seq allocation"
test_34() {
multiop_bg_pause $DIR/$tfile O_c || return 2
replay_barrier $SINGLEMDS
# clear the dmesg buffer so we only see errors from this recovery
- dmesg -c >/dev/null
+ do_facet $SINGLEMDS dmesg -c >/dev/null
fail_abort $SINGLEMDS
kill -USR1 $pid
- dmesg | grep "mds_unlink_orphan.*error .* unlinking orphan" && return 1
+ do_facet $SINGLEMDS dmesg | grep "error .* unlinking .* from PENDING" &&
+ return 1
wait $pid || return 3
sync
return 0
}
+start_full_debug_logging
run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
+stop_full_debug_logging
test_38() {
createmany -o $DIR/$tfile-%d 800
#b=2477,2532
test_40(){
- $LCTL mark multiop $MOUNT/$tfile OS_c
- multiop $MOUNT/$tfile OS_c &
- PID=$!
- writeme -s $MOUNT/${tfile}-2 &
- WRITE_PID=$!
- sleep 1
- facet_failover $SINGLEMDS
-#define OBD_FAIL_MDS_CONNECT_NET 0x117
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000117"
- kill -USR1 $PID
- stat1=`count_ost_writes`
- sleep $TIMEOUT
- stat2=`count_ost_writes`
- echo "$stat1, $stat2"
- if [ $stat1 -lt $stat2 ]; then
- echo "writes continuing during recovery"
- RC=0
- else
- echo "writes not continuing during recovery, bug 2477"
- RC=4
- fi
- echo "waiting for writeme $WRITE_PID"
- kill $WRITE_PID
- wait $WRITE_PID
+ # always need connection to MDS to verify layout during IO. LU-2628.
+ lctl get_param mdc.*.connect_flags | grep -q layout_lock &&
+ skip "layout_lock needs MDS connection for IO" && return 0
- echo "waiting for multiop $PID"
- wait $PID || return 2
- do_facet client munlink $MOUNT/$tfile || return 3
- do_facet client munlink $MOUNT/${tfile}-2 || return 3
- return $RC
+ $LCTL mark multiop $MOUNT/$tfile OS_c
+ multiop $MOUNT/$tfile OS_c &
+ PID=$!
+ writeme -s $MOUNT/${tfile}-2 &
+ WRITE_PID=$!
+ sleep 1
+ facet_failover $SINGLEMDS
+#define OBD_FAIL_MDS_CONNECT_NET 0x117
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000117"
+ kill -USR1 $PID
+ stat1=`count_ost_writes`
+ sleep $TIMEOUT
+ stat2=`count_ost_writes`
+ echo "$stat1, $stat2"
+ if [ $stat1 -lt $stat2 ]; then
+ echo "writes continuing during recovery"
+ RC=0
+ else
+ echo "writes not continuing during recovery, bug 2477"
+ RC=4
+ fi
+ echo "waiting for writeme $WRITE_PID"
+ kill $WRITE_PID
+ wait $WRITE_PID
+
+ echo "waiting for multiop $PID"
+ wait $PID || return 2
+ do_facet client munlink $MOUNT/$tfile || return 3
+ do_facet client munlink $MOUNT/${tfile}-2 || return 3
+ return $RC
}
run_test 40 "cause recovery in ptlrpc, ensure IO continues"
# the page, guarnateeing that the unlock from the RPC completion would
# assert on trying to unlock the unlocked page.
test_41() {
- [ $OSTCOUNT -lt 2 ] && \
- skip "skipping test 41: we don't have a second OST to test with" && \
- return
+ [ $OSTCOUNT -lt 2 ] &&
+ skip_env "skipping test 41: we don't have a second OST to test with" &&
+ return
local f=$MOUNT/$tfile
# make sure the start of the file is ost1
- lfs setstripe $f -s $((128 * 1024)) -i 0
+ $SETSTRIPE -S $((128 * 1024)) -i 0 $f
do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3
cancel_lru_locks osc
# fail ost2 and read from ost1
- local osc2dev=`do_facet $SINGLEMDS "lctl get_param -n devices | grep ${ost2_svc}-osc-MDT0000" | awk '{print $1}'`
- [ -z "$osc2dev" ] && echo "OST: $ost2_svc" && lctl get_param -n devices && return 4
+ local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $ost2_svc)
+ local osc2dev=$(do_facet $SINGLEMDS "lctl get_param -n devices" | \
+ grep $mdtosc | awk '{print $1}')
+ [ -z "$osc2dev" ] && echo "OST: $ost2_svc" && lctl get_param -n devices &&
+ return 4
do_facet $SINGLEMDS $LCTL --device $osc2dev deactivate || return 1
do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 3
do_facet $SINGLEMDS $LCTL --device $osc2dev activate || return 2
}
run_test 43 "mds osc import failure during recovery; don't LBUG"
-test_44a() { # was test_44
- local at_max_saved=0
-
- mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
-
- # adaptive timeouts slow this way down
- if at_is_enabled; then
- at_max_saved=$(at_max_get mds)
- at_max_set 40 mds
- fi
-
- for i in `seq 1 10`; do
- echo "$i of 10 ($(date +%s))"
- do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
- #define OBD_FAIL_TGT_CONN_RACE 0x701
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
- $LCTL --device $mdcdev recover
- df $MOUNT
- done
- do_facet $SINGLEMDS "lctl set_param fail_loc=0"
- [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
- return 0
+test_44a() { # was test_44
+ local at_max_saved=0
+
+ local mdcdev=$($LCTL get_param -n devices |
+ awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}")
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] ||
+ { echo mdcdev=$mdcdev; $LCTL dl; return 3; }
+
+ # adaptive timeouts slow this way down
+ if at_is_enabled; then
+ at_max_saved=$(at_max_get mds)
+ at_max_set 40 mds
+ fi
+
+ for i in `seq 1 10`; do
+ echo "$i of 10 ($(date +%s))"
+ do_facet $SINGLEMDS \
+ "lctl get_param -n md[ts].*.mdt.timeouts | grep service"
+#define OBD_FAIL_TGT_CONN_RACE 0x701
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
+ # lctl below may fail, it is valid case
+ $LCTL --device $mdcdev recover
+ df $MOUNT
+ done
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+ [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
+ return 0
}
run_test 44a "race in target handle connect"
test_44b() {
- mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
- for i in `seq 1 10`; do
- echo "$i of 10 ($(date +%s))"
- do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
- #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
- $LCTL --device $mdcdev recover
- df $MOUNT
- done
- do_facet $SINGLEMDS "lctl set_param fail_loc=0"
- return 0
+ local mdcdev=$($LCTL get_param -n devices |
+ awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}")
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] ||
+ { echo mdcdev=$mdcdev; $LCTL dl; return 3; }
+
+ for i in `seq 1 10`; do
+ echo "$i of 10 ($(date +%s))"
+ do_facet $SINGLEMDS \
+ "lctl get_param -n md[ts].*.mdt.timeouts | grep service"
+ #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
+ # lctl below may fail, it is valid case
+ $LCTL --device $mdcdev recover
+ df $MOUNT
+ done
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+ return 0
}
run_test 44b "race in target handle connect"
+test_44c() {
+ replay_barrier $SINGLEMDS
+ createmany -m $DIR/$tfile-%d 100 || error "failed to create directories"
+#define OBD_FAIL_TGT_RCVG_FLAG 0x712
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000712"
+ fail_abort $SINGLEMDS
+ unlinkmany $DIR/$tfile-%d 100 && error "unlinked after fail abort"
+ fail $SINGLEMDS
+ unlinkmany $DIR/$tfile-%d 100 && error "unlinked after fail"
+ return 0
+}
+run_test 44c "race in target handle connect"
+
# Handle failed close
test_45() {
- mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
- $LCTL --device $mdcdev recover
+ local mdcdev=$($LCTL get_param -n devices |
+ awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}")
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] ||
+ { echo mdcdev=$mdcdev; $LCTL dl; return 3; }
- multiop_bg_pause $DIR/$tfile O_c || return 1
- pid=$!
+ $LCTL --device $mdcdev recover || return 6
- # This will cause the CLOSE to fail before even
- # allocating a reply buffer
- $LCTL --device $mdcdev deactivate || return 4
+ multiop_bg_pause $DIR/$tfile O_c || return 1
+ pid=$!
- # try the close
- kill -USR1 $pid
- wait $pid || return 1
+ # This will cause the CLOSE to fail before even
+ # allocating a reply buffer
+ $LCTL --device $mdcdev deactivate || return 4
- $LCTL --device $mdcdev activate || return 5
- sleep 1
+ # try the close
+ kill -USR1 $pid
+ wait $pid || return 1
- $CHECKSTAT -t file $DIR/$tfile || return 2
- return 0
+ $LCTL --device $mdcdev activate || return 5
+ sleep 1
+
+ $CHECKSTAT -t file $DIR/$tfile || return 2
+ return 0
}
run_test 45 "Handle failed close"
# OBD_FAIL_OST_CREATE_NET 0x204
fail ost1
do_facet ost1 "lctl set_param fail_loc=0x80000204"
- df $MOUNT || return 2
+ client_up || return 2
# let the MDS discover the OST failure, attempt to recover, fail
# and recover again.
test_48() {
remote_ost_nodsh && skip "remote OST with nodsh" && return 0
- [ "$OSTCOUNT" -lt "2" ] && skip "$OSTCOUNT < 2 OSTs -- skipping" && return
+ [ "$OSTCOUNT" -lt "2" ] && skip_env "$OSTCOUNT < 2 OSTs -- skipping" && return
replay_barrier $SINGLEMDS
createmany -o $DIR/$tfile 20 || return 1
# OBD_FAIL_OST_EROFS 0x216
facet_failover $SINGLEMDS
do_facet ost1 "lctl set_param fail_loc=0x80000216"
- df $MOUNT || return 2
+ client_up || return 2
createmany -o $DIR/$tfile 20 20 || return 2
unlinkmany $DIR/$tfile 40 || return 3
run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
test_50() {
- local oscdev=`do_facet $SINGLEMDS lctl get_param -n devices | grep ${ost1_svc}-osc-MDT0000 | awk '{print $1}'`
+ local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $ost1_svc)
+ local oscdev=$(do_facet $SINGLEMDS "lctl get_param -n devices" | \
+ grep $mdtosc | awk '{print $1}')
[ "$oscdev" ] || return 1
do_facet $SINGLEMDS $LCTL --device $oscdev recover || return 2
do_facet $SINGLEMDS $LCTL --device $oscdev recover || return 3
# bug 3462 - simultaneous MDC requests
test_53a() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
multiop $DIR/${tdir}-1/f O_c &
run_test 53a "|X| close request while two MDC requests in flight"
test_53b() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
- multiop $DIR/${tdir}-1/f O_c &
+ multiop_bg_pause $DIR/${tdir}-1/f O_c || return 6
close_pid=$!
#define OBD_FAIL_MDS_REINT_NET 0x107
run_test 53b "|X| open request while two MDC requests in flight"
test_53c() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exist"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53c "|X| open request and close request while two MDC requests in flight"
test_53d() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
run_test 53d "|X| close reply while two MDC requests in flight"
test_53e() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
run_test 53e "|X| open reply while two MDC requests in flight"
test_53f() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exist"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53f "|X| open reply and close reply while two MDC requests in flight"
test_53g() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
-
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exist"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53g "|X| drop open reply and close request while close and open are both in flight"
test_53h() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
cancel_lru_locks mdc # force the close
sleep 1
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exist"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 58a "test recovery from llog for setattr op (test llog_gen_rec)"
test_58b() {
+ local orig
+ local new
+
+ large_xattr_enabled &&
+ orig="$(generate_string $(max_xattr_size))" || orig="bar"
+
mount_client $MOUNT2
mkdir -p $DIR/$tdir
touch $DIR/$tdir/$tfile
replay_barrier $SINGLEMDS
- setfattr -n trusted.foo -v bar $DIR/$tdir/$tfile
+ setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile
fail $SINGLEMDS
- VAL=`getfattr --absolute-names --only-value -n trusted.foo $MOUNT2/$tdir/$tfile`
- [ x$VAL = x"bar" ] || return 1
+ new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile)
+ [[ "$new" = "$orig" ]] || return 1
rm -f $DIR/$tdir/$tfile
rmdir $DIR/$tdir
zconf_umount `hostname` $MOUNT2
run_test 58b "test replay of setxattr op"
test_58c() { # bug 16570
- mount_client $MOUNT2
- mkdir -p $DIR/$tdir
- touch $DIR/$tdir/$tfile
- drop_request "setfattr -n trusted.foo -v bar $DIR/$tdir/$tfile" || \
- return 1
- VAL=`getfattr --absolute-names --only-value -n trusted.foo $MOUNT2/$tdir/$tfile`
- [ x$VAL = x"bar" ] || return 2
- drop_reint_reply "setfattr -n trusted.foo1 -v bar1 $DIR/$tdir/$tfile" || \
- return 3
- VAL=`getfattr --absolute-names --only-value -n trusted.foo1 $MOUNT2/$tdir/$tfile`
- [ x$VAL = x"bar1" ] || return 4
- rm -f $DIR/$tdir/$tfile
- rmdir $DIR/$tdir
- zconf_umount `hostname` $MOUNT2
+ local orig
+ local orig1
+ local new
+
+ if large_xattr_enabled; then
+ local xattr_size=$(max_xattr_size)
+ orig="$(generate_string $((xattr_size / 2)))"
+ orig1="$(generate_string $xattr_size)"
+ else
+ orig="bar"
+ orig1="bar1"
+ fi
+
+ mount_client $MOUNT2
+ mkdir -p $DIR/$tdir
+ touch $DIR/$tdir/$tfile
+ drop_request "setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile" ||
+ return 1
+ new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile)
+ [[ "$new" = "$orig" ]] || return 2
+ drop_reint_reply "setfattr -n trusted.foo1 -v $orig1 $DIR/$tdir/$tfile" ||
+ return 3
+ new=$(get_xattr_value trusted.foo1 $MOUNT2/$tdir/$tfile)
+ [[ "$new" = "$orig1" ]] || return 4
+ rm -f $DIR/$tdir/$tfile
+ rmdir $DIR/$tdir
+ zconf_umount $HOSTNAME $MOUNT2
}
run_test 58c "resend/reconstruct setxattr op"
}
run_test 61c "test race mds llog sync vs llog cleanup"
-test_61d() { # bug 16002 # bug 17466
- shutdown_facet $SINGLEMDS
-#define OBD_FAIL_OBD_LLOG_SETUP 0x605
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x605"
- start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS && error "mds start should have failed"
- do_facet $SINGLEMDS "lctl set_param fail_loc=0"
- start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || error "cannot restart mds"
+test_61d() { # bug 16002 # bug 17466 # bug 22137
+# OBD_FAIL_OBD_LLOG_SETUP 0x605
+ stop mgs
+ do_facet mgs "lctl set_param fail_loc=0x80000605"
+ start mgs $MGSDEV $MGS_MOUNT_OPTS && error "mgs start should have failed"
+ do_facet mgs "lctl set_param fail_loc=0"
+ start mgs $MGSDEV $MGS_MOUNT_OPTS || error "cannot restart mgs"
}
run_test 61d "error in llog_setup should cleanup the llog context correctly"
createmany -o $DIR/$tdir/$tfile- 25
#define OBD_FAIL_TGT_REPLAY_DROP 0x707
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
- facet_failover $SINGLEMDS
- df $MOUNT || return 1
+ fail $SINGLEMDS
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
unlinkmany $DIR/$tdir/$tfile- 25 || return 2
return 0
echo "Cleaning up AT ..."
if [ -n "$ATOLDBASE" ]; then
local at_history=$($LCTL get_param -n at_history)
- do_facet mds "lctl set_param at_history=$at_history" || true
+ do_facet $SINGLEMDS "lctl set_param at_history=$at_history" || true
do_facet ost1 "lctl set_param at_history=$at_history" || true
fi
done
if [ -z "$ATOLDBASE" ]; then
- ATOLDBASE=$(do_facet mds "lctl get_param -n at_history")
+ ATOLDBASE=$(do_facet $SINGLEMDS "lctl get_param -n at_history")
# speed up the timebase so we can check decreasing AT
- do_facet mds "lctl set_param at_history=8" || true
+ do_facet $SINGLEMDS "lctl set_param at_history=8" || true
do_facet ost1 "lctl set_param at_history=8" || true
# sleep for a while to cool down, should be > 8s and also allow
at_start || return 0
$LCTL dk > /dev/null
debugsave
- sysctl -w lnet.debug="+other"
+ $LCTL set_param debug="other"
# Slow down a request to the current service time, this is critical
# because previous tests may have caused this value to increase.
REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts |
awk '/portal 12/ {print $5}'`
REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
- do_facet mds lctl set_param fail_val=$((${REQ_DELAY} * 1000))
+ do_facet $SINGLEMDS lctl set_param fail_val=$((${REQ_DELAY} * 1000))
#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
- do_facet mds sysctl -w lustre.fail_loc=0x8000050a
+ do_facet $SINGLEMDS $LCTL set_param fail_loc=0x8000050a
createmany -o $DIR/$tfile 10 > /dev/null
unlinkmany $DIR/$tfile 10 > /dev/null
# check for log message
at_start || return 0
# turn on D_ADAPTTO
debugsave
- sysctl -w lnet.debug="other trace"
+ $LCTL set_param debug="other trace"
$LCTL dk > /dev/null
# Slow down a request to the current service time, this is critical
# because previous tests may have caused this value to increase.
+ $SETSTRIPE --stripe-index=0 --count=1 $DIR/$tfile
+ multiop $DIR/$tfile Ow1yc
REQ_DELAY=`lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts |
awk '/portal 6/ {print $5}'`
REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
do_facet ost1 lctl set_param fail_val=${REQ_DELAY}
#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
- do_facet ost1 sysctl -w lustre.fail_loc=0x224
+ do_facet ost1 $LCTL set_param fail_loc=0x224
rm -f $DIR/$tfile
- lfs setstripe $DIR/$tfile --index=0 --count=1
+ $SETSTRIPE --stripe-index=0 --count=1 $DIR/$tfile
# force some real bulk transfer
multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
- do_facet ost1 sysctl -w lustre.fail_loc=0
+ do_facet ost1 $LCTL set_param fail_loc=0
# check for log message
$LCTL dk | grep "Early reply #" || error "No early reply"
debugrestore
at_start || return 0
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
# adjust 5s at a time so no early reply is sent (within deadline)
- do_facet mds "sysctl -w lustre.fail_val=5000"
+ do_facet $SINGLEMDS "$LCTL set_param fail_val=5000"
#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
- do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+ do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000050a"
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
- do_facet mds "sysctl -w lustre.fail_val=10000"
- do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
+ do_facet $SINGLEMDS "$LCTL set_param fail_val=10000"
+ do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000050a"
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
- do_facet mds "sysctl -w lustre.fail_loc=0"
+ do_facet $SINGLEMDS "$LCTL set_param fail_loc=0"
sleep 9
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
test_66b() #bug 3055
{
- remote_ost_nodsh && skip "remote OST with nodsh" && return 0
-
- at_start || return 0
- ORIG=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}')
- sysctl -w lustre.fail_val=$(($ORIG + 5))
-#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
- sysctl -w lustre.fail_loc=0x50c
- ls $DIR/$tfile > /dev/null 2>&1
- sysctl -w lustre.fail_loc=0
- CUR=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}')
- WORST=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $6}')
- echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
- [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
+ remote_ost_nodsh && skip "remote OST with nodsh" && return 0
+
+ at_start || return 0
+ ORIG=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
+ awk '/network/ {print $4}')
+ $LCTL set_param fail_val=$(($ORIG + 5))
+ #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
+ $LCTL set_param fail_loc=0x50c
+ ls $DIR/$tfile > /dev/null 2>&1
+ $LCTL set_param fail_loc=0
+ CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
+ awk '/network/ {print $4}')
+ WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
+ awk '/network/ {print $6}')
+ echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
+ [ $WORST -gt $ORIG ] ||
+ error "Worst $WORST should be worse than orig $ORIG"
}
run_test 66b "AT: verify net latency adjusts"
at_start || return 0
CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
# sleeping threads may drive values above this
- do_facet ost1 "sysctl -w lustre.fail_val=400"
+ do_facet ost1 "$LCTL set_param fail_val=400"
#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
- do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
+ do_facet ost1 "$LCTL set_param fail_loc=0x50a"
createmany -o $DIR/$tfile 20 > /dev/null
unlinkmany $DIR/$tfile 20 > /dev/null
- do_facet ost1 "sysctl -w lustre.fail_loc=0"
+ do_facet ost1 "$LCTL set_param fail_loc=0"
CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
ATTEMPTS=$(($CONN2 - $CONN1))
echo "$ATTEMPTS osc reconnect attempts on gradual slow"
at_start || return 0
CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
+
+ # exhaust precreations on ost1
+ local OST=$(ostname_from_index 0)
+ local mdtosc=$(get_mdtosc_proc_path mds $OST)
+ local last_id=$(do_facet $SINGLEMDS lctl get_param -n \
+ osc.$mdtosc.prealloc_last_id)
+ local next_id=$(do_facet $SINGLEMDS lctl get_param -n \
+ osc.$mdtosc.prealloc_next_id)
+
+ mkdir -p $DIR/$tdir/${OST}
+ $SETSTRIPE -i 0 -c 1 $DIR/$tdir/${OST} || error "$SETSTRIPE"
+ echo "Creating to objid $last_id on ost $OST..."
#define OBD_FAIL_OST_PAUSE_CREATE 0x223
- do_facet ost1 "sysctl -w lustre.fail_val=20000"
- do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
- cp /etc/profile $DIR/$tfile || error "cp failed"
+ do_facet ost1 "$LCTL set_param fail_val=20000"
+ do_facet ost1 "$LCTL set_param fail_loc=0x80000223"
+ createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2))
+
client_reconnect
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
log "phase 2"
ATTEMPTS=$(($CONN2 - $CONN1))
echo "$ATTEMPTS osc reconnect attempts on instant slow"
# do it again; should not timeout
- do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
+ do_facet ost1 "$LCTL set_param fail_loc=0x80000223"
cp /etc/profile $DIR/$tfile || error "cp failed"
- do_facet ost1 "sysctl -w lustre.fail_loc=0"
+ do_facet ost1 "$LCTL set_param fail_loc=0"
client_reconnect
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
rm -rf $DIR/$tdir
mkdir -p $DIR/$tdir
- lfs setstripe $DIR/$tdir --index=0 --count=1
+ $SETSTRIPE --stripe-index=0 --count=1 $DIR/$tdir
#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
- sysctl -w lustre.fail_val=$(($TIMEOUT - 1))
- sysctl -w lustre.fail_loc=0x80000312
+ $LCTL set_param fail_val=$(($TIMEOUT - 1))
+ $LCTL set_param fail_loc=0x80000312
cp /etc/profile $DIR/$tdir/${tfile}_1 || error "1st cp failed $?"
- sysctl -w lustre.fail_val=$((TIMEOUT * 5 / 4))
- sysctl -w lustre.fail_loc=0x80000312
+ $LCTL set_param fail_val=$((TIMEOUT * 5 / 4))
+ $LCTL set_param fail_loc=0x80000312
cp /etc/profile $DIR/$tdir/${tfile}_2 || error "2nd cp failed $?"
- sysctl -w lustre.fail_loc=0
+ $LCTL set_param fail_loc=0
echo $ENQ_MIN >> $ldlm_enqueue_min
do_facet ost1 "echo $ENQ_MIN_R >> $ldlm_enqueue_min_r"
{ skip "Need two or more clients, have $CLIENTCOUNT" && return; }
echo "mount clients $CLIENTS ..."
- zconf_mount_clients $CLIENTS $DIR
+ zconf_mount_clients $CLIENTS $MOUNT
local clients=${CLIENTS//,/ }
echo "Write/read files on $DIR ; clients $CLIENTS ... "
}
run_test 70a "check multi client t-f"
# check_for_process <clients> <prog>...
# Succeed when <prog> is running on every node in <clients>.
# Uses "killall -0" as a pure existence probe (no signal is delivered).
check_for_process () {
	local nodes=$1
	shift

	killall_process $nodes "$*" -0
}
+
# killall_process [clients] <name> [signal]
# Run "killall [signal] <name>" on every node in <clients> (default: this
# host).  Returns the do_nodes status: non-zero if the process is absent on
# some node, which is exactly what check_for_process relies on with -0.
# NB: dropped an unused "local rc=0" that was never read or returned.
killall_process () {
	local clients=${1:-$(hostname)}
	local name=$2
	local signal=$3

	do_nodes $clients "killall $signal $name"
}
+
# test_70b: run dbench on all clients while repeatedly failing over the MDS,
# and verify the load survives every recovery cycle.
test_70b () {
	local clients=${CLIENTS:-$HOSTNAME}

	zconf_mount_clients $clients $MOUNT

	local duration=300
	[ "$SLOW" = "no" ] && duration=120
	# set duration to 900 because it takes some time to boot node
	[ "$FAILURE_MODE" = HARD ] && duration=900

	local elapsed
	local start_ts=$(date +%s)
	local cmd="rundbench 1 -t $duration"
	local pid=""
	do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
		PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
		DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \
		MOUNT=$MOUNT DIR=$DIR/$tdir/\\\$(hostname) LCTL=$LCTL $cmd" &
	pid=$!

	#LU-1897 wait for all dbench copies to start
	while ! check_for_process $clients dbench; do
		elapsed=$(($(date +%s) - start_ts))
		# FIX: was "[ $elapsed -gt $duration]" — the missing space
		# before "]" made this test a runtime syntax error
		if [ $elapsed -gt $duration ]; then
			killall_process $clients dbench
			error "dbench failed to start on $clients!"
		fi
		sleep 1
	done

	log "Started rundbench load pid=$pid ..."

	elapsed=$(($(date +%s) - start_ts))
	local num_failovers=0
	while [ $elapsed -lt $duration ]; do
		# abort early if the load died on any client
		if ! check_for_process $clients dbench; then
			error_noexit "dbench stopped on some of $clients!"
			killall_process $clients dbench
			break
		fi
		sleep 1
		replay_barrier $SINGLEMDS
		sleep 1 # give clients a time to do operations
		# Increment the number of failovers
		num_failovers=$((num_failovers+1))
		log "$TESTNAME fail $SINGLEMDS $num_failovers times"
		fail $SINGLEMDS
		elapsed=$(($(date +%s) - start_ts))
	done

	wait $pid || error "rundbench load on $clients failed!"
}
run_test 70b "mds recovery; $CLIENTCOUNT clients"
# end multi-client tests
rm -f $DIR/$tfile
replay_barrier $SINGLEMDS
-#define OBD_FAIL_LDLM_ENQUEUE 0x302
+#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000302"
fail $SINGLEMDS
kill -USR1 $pid
# test_74: with ost1 down across an MDS failover, applications must still
# make progress once ost1 comes back (no stuck waits on OST recovery).
test_74() {
	local client_nodes=${CLIENTS:-$HOSTNAME}

	zconf_umount_clients $client_nodes $MOUNT
	stop ost1
	facet_failover $SINGLEMDS
	zconf_mount_clients $client_nodes $MOUNT
	mount_facet ost1
	touch $DIR/$tfile || return 1
	rm $DIR/$tfile || return 2
	clients_up || error "client evicted: $?"
	return 0
}
run_test 74 "Ensure applications don't fail waiting for OST recovery"
# Shared verifier for the test_80* family: confirm the remote directory and
# files created inside it actually live on MDT1.
# NB: reads $remote_dir from the calling test via bash dynamic scoping.
remote_dir_check_80() {
	local MDTIDX=1
	local idx=$($GETSTRIPE -M $remote_dir)
	[ $idx -eq $MDTIDX ] || error "$idx != $MDTIDX"

	createmany -o $remote_dir/f-%d 20 || error "creation failed"
	local fidx=$($GETSTRIPE -M $remote_dir/f-1)
	[ $fidx -eq $MDTIDX ] || error "$fidx != $MDTIDX"

	return 0
}
+
# DNE recovery: drop the cross-MDT update reply while a remote directory is
# being created, then fail over MDT1 and verify the create is replayed.
test_80a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	#define OBD_FAIL_UPDATE_OBJ_NET_REP	0x1701
	do_facet mds$((mdt_idx + 1)) lctl set_param fail_loc=0x1701
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds$((mdt_idx + 1))

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80a "DNE: create remote dir, drop update rep from MDT1, fail MDT1"
# DNE recovery: drop the cross-MDT update reply during remote mkdir, then
# fail over MDT0 (the master) and verify the create is replayed.
test_80b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	#define OBD_FAIL_UPDATE_OBJ_NET_REP	0x1701
	do_facet mds$((mdt_idx + 1)) lctl set_param fail_loc=0x1701
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds${mdt_idx}

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80b "DNE: create remote dir, drop update rep from MDT1, fail MDT0"
+
# DNE recovery: drop the cross-MDT update reply during remote mkdir, then
# fail over MDT0 followed by MDT1, and verify the create is replayed.
test_80c() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	#define OBD_FAIL_UPDATE_OBJ_NET_REP	0x1701
	do_facet mds$((mdt_idx + 1)) lctl set_param fail_loc=0x1701
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds${mdt_idx}
	fail mds$((mdt_idx + 1))

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80c "DNE: create remote dir, drop update rep from MDT1, fail MDT[0,1]"
+
# DNE recovery: drop the cross-MDT update reply during remote mkdir, then
# fail both MDTs simultaneously and verify the create is replayed.
test_80d() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	#define OBD_FAIL_UPDATE_OBJ_NET_REP	0x1701
	do_facet mds$((mdt_idx + 1)) lctl set_param fail_loc=0x1701
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds${mdt_idx},mds$((mdt_idx + 1))

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80d "DNE: create remote dir, drop update rep from MDT1, fail 2 MDTs"
+
# DNE recovery: drop the client-visible reint reply from MDT0 during remote
# mkdir, then fail over MDT0 and verify the create is replayed.
test_80e() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds${mdt_idx} lctl set_param fail_loc=0x119
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds${mdt_idx}

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80e "DNE: create remote dir, drop MDT0 rep, fail MDT0"
+
# DNE recovery: drop the reint reply from MDT0 during remote mkdir, then
# fail over MDT1 (the remote MDT) and verify the create is replayed.
test_80f() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0
	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds${mdt_idx} lctl set_param fail_loc=0x119
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds$((mdt_idx + 1))

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80f "DNE: create remote dir, drop MDT0 rep, fail MDT1"
+
# DNE recovery: drop the reint reply from MDT0 during remote mkdir, then
# fail over MDT0 followed by MDT1, and verify the create is replayed.
test_80g() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds${mdt_idx} lctl set_param fail_loc=0x119
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds${mdt_idx}
	fail mds$((mdt_idx + 1))

	wait $mkdir_pid || error "remote creation failed"

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80g "DNE: create remote dir, drop MDT0 rep, fail MDT0, then MDT1"
+
# DNE recovery: drop the reint reply from MDT0 during remote mkdir, then
# fail both MDTs simultaneously and verify the create is replayed.
test_80h() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	local mdt_idx=1
	# remote_dir is read by remote_dir_check_80 via dynamic scoping
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds${mdt_idx} lctl set_param fail_loc=0x119
	$LFS mkdir -i $mdt_idx $remote_dir &
	local mkdir_pid=$!

	fail mds${mdt_idx},mds$((mdt_idx + 1))

	wait $mkdir_pid || return 1

	remote_dir_check_80 || error "remote dir check failed"
	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 80h "DNE: create remote dir, drop MDT0 rep, fail 2 MDTs"
# DNE recovery: drop the MDT0 cross-MDT update reply while removing a remote
# directory, fail over MDT1, and verify the unlink is replayed.
test_81a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir
	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	touch $remote_dir
	# OBD_FAIL_OBJ_UPDATE_NET_REP       0x1701
	do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds$((MDTIDX + 1))

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null", which passed a literal "2" argument to
	# stat instead of redirecting, so the existence check never fired
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81a "DNE: unlink remote dir, drop MDT0 update rep, fail MDT1"
+
# DNE recovery: drop the MDT0 cross-MDT update reply while removing a remote
# directory, fail over MDT0, and verify the unlink is replayed.
test_81b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0
	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_OBJ_UPDATE_NET_REP       0x1701
	do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds${MDTIDX}

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81b "DNE: unlink remote dir, drop MDT0 update reply, fail MDT0"
+
# DNE recovery: drop the MDT0 cross-MDT update reply while removing a remote
# directory, fail over MDT0 then MDT1, and verify the unlink is replayed.
test_81c() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_OBJ_UPDATE_NET_REP       0x1701
	do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds${MDTIDX}
	fail mds$((MDTIDX + 1))

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81c "DNE: unlink remote dir, drop MDT0 update reply, fail MDT0,MDT1"
+
# DNE recovery: drop the MDT0 cross-MDT update reply while removing a remote
# directory, fail both MDTs simultaneously, and verify the unlink replays.
test_81d() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_OBJ_UPDATE_NET_REP       0x1701
	do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds${MDTIDX},mds$((MDTIDX + 1))

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81d "DNE: unlink remote dir, drop MDT0 update reply, fail 2 MDTs"
+
# DNE recovery: drop the MDT1 request reply while removing a remote
# directory, fail over MDT0, and verify the unlink is replayed.
test_81e() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
	rmdir $remote_dir &
	local CLIENT_PID=$!
	# clear the fail_loc so only the first reply is dropped
	do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0

	fail mds${MDTIDX}

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81e "DNE: unlink remote dir, drop MDT1 req reply, fail MDT0"
+
# DNE recovery: drop the MDT1 request reply while removing a remote
# directory, fail over MDT1, and verify the unlink is replayed.
test_81f() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds$((MDTIDX + 1))

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81f "DNE: unlink remote dir, drop MDT1 req reply, fail MDT1"
+
# DNE recovery: drop the MDT1 request reply while removing a remote
# directory, fail over MDT0 then MDT1, and verify the unlink is replayed.
test_81g() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds${MDTIDX}
	fail mds$((MDTIDX + 1))

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81g "DNE: unlink remote dir, drop req reply, fail M0, then M1"
+
# DNE recovery: drop the MDT1 request reply while removing a remote
# directory, fail both MDTs simultaneously, and verify the unlink replays.
test_81h() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	local MDTIDX=1
	local remote_dir=$DIR/$tdir/remote_dir

	mkdir -p $DIR/$tdir
	$LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"

	# OBD_FAIL_MDS_REINT_NET_REP       0x119
	do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
	rmdir $remote_dir &
	local CLIENT_PID=$!

	fail mds${MDTIDX},mds$((MDTIDX + 1))

	wait $CLIENT_PID || error "rm remote dir failed"

	# FIX: was "2&>/dev/null" — broken redirection defeated this check
	stat $remote_dir >/dev/null 2>&1 && error "$remote_dir still exist!"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 81h "DNE: unlink remote dir, drop request reply, fail 2 MDTs"
+
# test_83a: inject a llog record-add failure on the MDS while unlinking,
# and make sure the unlinks still succeed.
test_83a() {
	mkdir -p $DIR/$tdir
	createmany -o $DIR/$tdir/$tfile- 10 || return 1
#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD       0x140
	do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000140"
	unlinkmany $DIR/$tdir/$tfile- 10 || return 2
}
run_test 83a "fail log_add during unlink recovery"
+
# test_83b: same llog record-add failure as 83a, but injected while the
# unlinks are being REPLAYED during MDS recovery.
test_83b() {
	mkdir -p $DIR/$tdir
	createmany -o $DIR/$tdir/$tfile- 10 || return 1
	replay_barrier $SINGLEMDS
	unlinkmany $DIR/$tdir/$tfile- 10 || return 2
#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD       0x140
	do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000140"
	fail $SINGLEMDS
}
run_test 83b "fail log_add during unlink recovery"
+
# test_84a: evict the client while an open/create is stalled on the MDS and
# verify the stale open does not wedge the export disconnect.
test_84a() {
#define OBD_FAIL_MDS_OPEN_WAIT_CREATE  0x144
	do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144"
	createmany -o $DIR/$tfile- 1 &
	local pid=$!
	mds_evict_client
	wait $pid
	client_up || client_up || true	# reconnect
}
run_test 84a "stale open during export disconnect"
+
# test_85a: unused MDT (IBITS) locks must be cancelled before replay, so the
# unused-lock count goes down across an MDS failover.
test_85a() { #bug 16774
	lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"

	# populate a pile of cached MDT locks
	for i in $(seq 100); do
		echo "tag-$i" > $DIR/$tfile-$i
		grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i"
	done

	lov_id=$(lctl dl | grep "clilov")
	addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}')
	count=$(lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count)
	echo "before recovery: unused locks count = $count"

	fail $SINGLEMDS

	count2=$(lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count)
	echo "after recovery: unused locks count = $count2"

	if [ $count2 -ge $count ]; then
		error "unused locks are not canceled"
	fi
}
run_test 85a "check the cancellation of unused locks during recovery(IBITS)"
+
# test_85b: unused OST (EXTENT) locks must be cancelled before replay, so
# the unused-lock count goes down across an OST failover.
test_85b() { #bug 16774
	lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"

	# pin all files to OST0000 via a pool so the lock namespace is known
	do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME || return 1
	do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $FSNAME-OST0000 || return 2

	$SETSTRIPE -c 1 -p $FSNAME.$TESTNAME $DIR

	for i in $(seq 100); do
		dd if=/dev/urandom of=$DIR/$tfile-$i bs=4096 count=32 >/dev/null 2>&1
	done

	cancel_lru_locks osc

	# re-read to accumulate unused extent locks
	for i in $(seq 100); do
		dd if=$DIR/$tfile-$i of=/dev/null bs=4096 count=32 >/dev/null 2>&1
	done

	lov_id=$(lctl dl | grep "clilov")
	addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}')
	count=$(lctl get_param -n ldlm.namespaces.*OST0000*$addr.lock_unused_count)
	echo "before recovery: unused locks count = $count"
	[ $count != 0 ] || return 3

	fail ost1

	count2=$(lctl get_param -n ldlm.namespaces.*OST0000*$addr.lock_unused_count)
	echo "after recovery: unused locks count = $count2"

	do_facet mgs $LCTL pool_remove $FSNAME.$TESTNAME $FSNAME-OST0000 || return 4
	do_facet mgs $LCTL pool_destroy $FSNAME.$TESTNAME || return 5

	if [ $count2 -ge $count ]; then
		error "unused locks are not canceled"
	fi
}
run_test 85b "check the cancellation of unused locks during recovery(EXTENT)"
+
# test_86: clearing per-export nid stats and then remounting the MDS must
# not trigger an LBUG on umount.
test_86() {
	local client_nodes=${CLIENTS:-$HOSTNAME}

	zconf_umount_clients $client_nodes $MOUNT
	do_facet $SINGLEMDS lctl set_param mdt.${FSNAME}-MDT*.exports.clear=0
	remount_facet $SINGLEMDS
	zconf_mount_clients $client_nodes $MOUNT
}
run_test 86 "umount server after clear nid_stats should not hit LBUG"
+
# test_87: with async journal commits on ost1, written data must be replayed
# intact across an OST failover (checksum before == checksum after).
test_87() {
	do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"

	replay_barrier ost1
	$SETSTRIPE -i 0 -c 1 $DIR/$tfile
	dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 || error "Cannot write"
	cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
	cancel_lru_locks osc
	fail ost1
	dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
	cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
	if [ $cksum != $cksum2 ] ; then
		error "New checksum $cksum2 does not match original $cksum"
	fi
}
run_test 87 "write replay"
+
# test_87b: like test 87, but overwrite part of the flushed data first so
# the replayed write carries changed bytes (exercises checksum resend).
test_87b() {
	do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"

	replay_barrier ost1
	$SETSTRIPE -i 0 -c 1 $DIR/$tfile
	dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 || error "Cannot write"
	sleep 1 # Give it a chance to flush dirty data
	echo TESTTEST | dd of=$DIR/$tfile bs=1 count=8 seek=64
	cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
	cancel_lru_locks osc
	fail ost1
	dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
	cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
	if [ $cksum != $cksum2 ] ; then
		error "New checksum $cksum2 does not match original $cksum"
	fi
}
run_test 87b "write replay with changed data (checksum resend)"
+
# test_88 (bug 17485): after failing MDS and ost1 together, the MDS must not
# hand out object ids that were already assigned to existing files.
test_88() { #bug 17485
	mkdir -p $DIR/$tdir
	mkdir -p $TMP/$tdir

	$SETSTRIPE -i 0 -c 1 $DIR/$tdir || error "$SETSTRIPE"

	replay_barrier ost1
	replay_barrier $SINGLEMDS

	# exhaust precreations on ost1
	local OST=$(ostname_from_index 0)
	local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $OST)
	local last_id=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_last_id)
	local next_id=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
	echo "before test: last_id = $last_id, next_id = $next_id"

	echo "Creating to objid $last_id on ost $OST..."
	createmany -o $DIR/$tdir/f-%d $next_id $((last_id - next_id + 2))

	#create some files to use some uncommitted objids
	last_id=$(($last_id + 1))
	createmany -o $DIR/$tdir/f-%d $last_id 8

	last_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_last_id)
	next_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
	echo "before recovery: last_id = $last_id2, next_id = $next_id2"

	# if test uses shutdown_facet && reboot_facet instead of facet_failover ()
	# it has to take care about the affected facets, bug20407
	local affected_mds1=$(affected_facets mds1)
	local affected_ost1=$(affected_facets ost1)

	shutdown_facet $SINGLEMDS
	shutdown_facet ost1

	reboot_facet $SINGLEMDS
	change_active $affected_mds1
	wait_for_facet $affected_mds1
	mount_facets $affected_mds1 || error "Restart of mds failed"

	reboot_facet ost1
	change_active $affected_ost1
	wait_for_facet $affected_ost1
	mount_facets $affected_ost1 || error "Restart of ost1 failed"

	clients_up

	last_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_last_id)
	next_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
	echo "after recovery: last_id = $last_id2, next_id = $next_id2"

	# create new files, which should use new objids, and ensure the orphan
	# cleanup phase for ost1 is completed at the same time
	for i in $(seq 8); do
		file_id=$(($last_id + 10 + $i))
		dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128
	done

	# if the objids were not recreated, then "ls" will failed for -ENOENT
	ls -l $DIR/$tdir/* || error "can't get the status of precreated files"

	local file_id
	# write into previously created files
	for i in $(seq 8); do
		file_id=$(($last_id + $i))
		dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128
		cp -f $DIR/$tdir/f-$file_id $TMP/$tdir/
	done

	# compare the content
	for i in $(seq 8); do
		file_id=$(($last_id + $i))
		cmp $TMP/$tdir/f-$file_id $DIR/$tdir/f-$file_id || error "the content" \
			"of file is modified!"
	done

	rm -fr $TMP/$tdir
}
run_test 88 "MDS should not assign same objid to different files "
+
# test_89: delete a file while its OST is down across an MDS failover; once
# the OST reconnects, the blocks must be reclaimed (no space leak).
test_89() {
	cancel_lru_locks osc
	mkdir -p $DIR/$tdir
	rm -f $DIR/$tdir/$tfile
	wait_mds_ost_sync
	wait_delete_completed
	BLOCKS1=$(df -P $MOUNT | tail -n 1 | awk '{ print $3 }')
	$SETSTRIPE -i 0 -c 1 $DIR/$tdir/$tfile
	dd if=/dev/zero bs=1M count=10 of=$DIR/$tdir/$tfile
	sync
	stop ost1
	facet_failover $SINGLEMDS
	rm $DIR/$tdir/$tfile
	umount $MOUNT
	mount_facet ost1
	zconf_mount $(hostname) $MOUNT
	client_up || return 1
	wait_mds_ost_sync
	wait_delete_completed
	BLOCKS2=$(df -P $MOUNT | tail -n 1 | awk '{ print $3 }')
	# allow a few blocks of slop for metadata churn
	[ $((BLOCKS2 - BLOCKS1)) -le 4 ] || \
		error $((BLOCKS2 - BLOCKS1)) blocks leaked
}

run_test 89 "no disk space leak on late ost connection"
+
# cleanup_90 <facet>: restore the OST that test_90 shut down — clear the
# trap, reboot/reactivate/remount the facet, and wait for clients.
cleanup_90 () {
	local failed_facet=$1

	trap 0
	reboot_facet $failed_facet
	change_active $failed_facet
	wait_for_facet $failed_facet
	mount_facet $failed_facet || error "Restart of $failed_facet failed"
	clients_up
}
+
# test_90 (bug 19494): shut down a random OST and verify "lfs find --obd"
# and "lfs getstripe --obd" report exactly the files striped on it.
test_90() { # bug 19494
	local dir=$DIR/$tdir
	local ostfail=$(get_random_entry $(get_facets OST))

	if [[ $FAILURE_MODE = HARD ]]; then
		local affected=$(affected_facets $ostfail);
		if [[ "$affected" != $ostfail ]]; then
			skip not functional with FAILURE_MODE=$FAILURE_MODE, affected: $affected
			return 0
		fi
	fi

	mkdir -p $dir

	echo "Create the files"

	# file "f${index}" striped over 1 OST
	# file "all" striped over all OSTs

	$SETSTRIPE -c $OSTCOUNT $dir/all ||
		error "setstripe failed to create $dir/all"

	for (( i=0; i<$OSTCOUNT; i++ )); do
		local f=$dir/f$i
		$SETSTRIPE -i $i -c 1 $f || error "$SETSTRIPE failed to create $f"

		# confirm setstripe actually created the stripe on the requested OST
		local uuid=$(ostuuid_from_index $i)
		for file in f$i all; do
			if [[ $dir/$file != $($LFS find --obd $uuid --name $file $dir) ]]; then
				# FIX: was "$GETSTRIPE $dir/file" — literal
				# "file" instead of the mismatched $file
				$GETSTRIPE $dir/$file
				error wrong stripe: $file, uuid: $uuid
			fi
		done
	done

	# Before failing an OST, get its obd name and index
	local varsvc=${ostfail}_svc
	local obd=$(do_facet $ostfail lctl get_param -n obdfilter.${!varsvc}.uuid)
	local index=$(($(facet_number $ostfail) - 1))

	echo "Fail $ostfail $obd, display the list of affected files"
	shutdown_facet $ostfail || return 2

	trap "cleanup_90 $ostfail" EXIT INT
	echo "General Query: lfs find $dir"
	local list=$($LFS find $dir)
	echo "$list"
	for (( i=0; i<$OSTCOUNT; i++ )); do
		list_member "$list" $dir/f$i || error_noexit "lfs find $dir: no file f$i"
	done
	list_member "$list" $dir/all || error_noexit "lfs find $dir: no file all"

	# focus on the missing OST,
	# we expect to see only two files affected: "f$(index)" and "all"

	echo "Querying files on shutdown $ostfail: lfs find --obd $obd"
	list=$($LFS find --obd $obd $dir)
	echo "$list"
	for file in all f$index; do
		list_member "$list" $dir/$file ||
			error_noexit "lfs find does not report the affected $obd for $file"
	done

	[[ $(echo $list | wc -w) -eq 2 ]] ||
		error_noexit "lfs find reports the wrong list of affected files ${#list[@]}"

	echo "Check getstripe: $GETSTRIPE -r --obd $obd"
	list=$($GETSTRIPE -r --obd $obd $dir)
	echo "$list"
	for file in all f$index; do
		echo "$list" | grep $dir/$file ||
			error_noexit "lfs getsripe does not report the affected $obd for $file"
	done

	cleanup_90 $ostfail
}
run_test 90 "lfs find identifies the missing striped file segments"
# Suite teardown: record completion time, clean up the filesystem state,
# and exit with the framework-accumulated status.
complete $SECONDS
check_and_cleanup_lustre
exit_status