# Source the shared test framework, call init_test_env with the script's
# arguments, then source the configuration file (CONFIG is set to
# $LUSTRE/tests/cfg/$NAME.sh when it is unset or empty).
. $LUSTRE/tests/test-framework.sh
init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
# Both settings keep a non-empty value already present in the environment.
CHECK_GRANT=${CHECK_GRANT:-"yes"}
GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
-remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
# Leave the script with status 0 when require_dsh_mds reports failure.
+require_dsh_mds || exit 0
# Skip these tests
-# bug number: 17466 15962
-ALWAYS_EXCEPT="61d $REPLAY_SINGLE_EXCEPT"
+# bug number: 17466 18857
+ALWAYS_EXCEPT="61d 33a 33b $REPLAY_SINGLE_EXCEPT"
if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
lfs getstripe $DIR/$tfile || return 1
rm -f $DIR/$tfile || return 2 # make it an orphan
mds_evict_client
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
fail $SINGLEMDS # start orphan recovery
- df -P $DIR || df -P $DIR || true # reconnect
wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
-
- # just because recovery is done doesn't mean we've finished
- # orphan cleanup. Wait for llogs to get synchronized.
- echo waiting for orphan cleanup...
- while [ true ]; do
- local -a sync=($(do_facet ost "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}'))
- local con=1
- for ((i=0; i<${#sync[@]}; i++)); do
- [ ${sync[$i]} -eq 0 ] && continue
- # there is a not finished MDS-OST synchronization
- con=0
- break;
- done
- [ ${con} -eq 1 ] && break
- sleep 1
- done
-
- # let the statfs cache to get old enough.
- sleep 1
-
+ wait_mds_ost_sync || return 3
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
[ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
ls -la $DIR/$tfile
mds_evict_client
-
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
kill -USR1 $pid
- test -s $DIR/$tfile || error "File was truncated"
-
wait $pid || return 1
+ [ -s $DIR/$tfile ] || error "File was truncated"
+
return 0
}
run_test 20c "check that client eviction does not affect file content"
multiop_bg_pause $DIR/$tfile O_c || return 3
pid2=$!
mds_evict_client
- df $MOUNT || sleep 1 && df $MOUNT || return 1
+ client_up || client_up || return 1
kill -USR1 $pid1
kill -USR1 $pid2
wait $pid1 || return 4
}
run_test 32 "close() notices client eviction; close() after client eviction"
-# Abort recovery before client complete
-test_33a() { # was test_33
- replay_barrier $SINGLEMDS
- createmany -o $DIR/$tfile-%d 100
# New test_33a: create 10 files, place a no-sync replay barrier on
# $SINGLEMDS, fail it over through fail_abort, then require that 10 more
# files can be created under a different name pattern ("--" infix).
# Returns 1 if that second createmany fails, 0 otherwise.
+test_33a() {
+ createmany -o $DIR/$tfile-%d 10
+ replay_barrier_nosync $SINGLEMDS
fail_abort $SINGLEMDS
- # this file should be gone, because the replay was aborted
- $CHECKSTAT -t file $DIR/$tfile-* && return 3
- unlinkmany $DIR/$tfile-%d 0 100
+ # recreate shouldn't fail
+ createmany -o $DIR/$tfile--%d 10 || return 1
# the glob below matches both the "-N" and the "--N" file names
+ rm $DIR/$tfile-* -f
return 0
}
-run_test 33a "abort recovery before client does replay"
+run_test 33a "fid seq shouldn't be reused after abort recovery"
+
# New test_33b: set fail_loc=0x1311 (OBD_FAIL_SEQ_ALLOC) on $SINGLEMDS,
# create 10 files, place a no-sync replay barrier, fail the MDS over through
# fail_abort, then require that 10 files with new names ("--" infix) can be
# created. Returns 1 if that second createmany fails, 0 otherwise.
# NOTE(review): fail_loc is never set back to 0 in this function -- confirm
# that the failover done by fail_abort clears it.
+test_33b() {
+ #define OBD_FAIL_SEQ_ALLOC 0x1311
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x1311"
-# Stale FID sequence bug 15962
-test_33b() { # was test_33a
- replay_barrier $SINGLEMDS
createmany -o $DIR/$tfile-%d 10
+ replay_barrier_nosync $SINGLEMDS
fail_abort $SINGLEMDS
- unlinkmany $DIR/$tfile-%d 0 10
# recreate shouldn't fail
- createmany -o $DIR/$tfile-%d 10 || return 3
- unlinkmany $DIR/$tfile-%d 0 10
+ createmany -o $DIR/$tfile--%d 10 || return 1
# the glob below matches both the "-N" and the "--N" file names
+ rm $DIR/$tfile-* -f
return 0
}
-run_test 33b "fid shouldn't be reused after abort recovery"
+run_test 33b "test fid seq allocation"
test_34() {
multiop_bg_pause $DIR/$tfile O_c || return 2
sync
return 0
}
+start_full_debug_logging
run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
+stop_full_debug_logging
test_38() {
createmany -o $DIR/$tfile-%d 800
}
run_test 43 "mds osc import failure during recovery; don't LBUG"
-test_44a() { # was test_44
+test_44a() { # was test_44
local at_max_saved=0
mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; }
# adaptive timeouts slow this way down
if at_is_enabled; then
fi
for i in `seq 1 10`; do
- echo "$i of 10 ($(date +%s))"
- do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
- #define OBD_FAIL_TGT_CONN_RACE 0x701
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
- $LCTL --device $mdcdev recover
- df $MOUNT
+ echo "$i of 10 ($(date +%s))"
+ do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
+ #define OBD_FAIL_TGT_CONN_RACE 0x701
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
+ # lctl below may fail; that is a valid case
+ $LCTL --device $mdcdev recover
+ df $MOUNT
done
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
[ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
run_test 44a "race in target handle connect"
test_44b() {
- mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
+ local mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; }
+
for i in `seq 1 10`; do
echo "$i of 10 ($(date +%s))"
- do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
- #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
- $LCTL --device $mdcdev recover
- df $MOUNT
+ do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
+ #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
+ # lctl below may fail; that is a valid case
+ $LCTL --device $mdcdev recover
+ df $MOUNT
done
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
return 0
# Handle failed close
test_45() {
mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
- $LCTL --device $mdcdev recover
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; }
+
+ $LCTL --device $mdcdev recover || return 6
multiop_bg_pause $DIR/$tfile O_c || return 1
pid=$!
# OBD_FAIL_OST_CREATE_NET 0x204
fail ost1
do_facet ost1 "lctl set_param fail_loc=0x80000204"
- df $MOUNT || return 2
+ client_up || return 2
# let the MDS discover the OST failure, attempt to recover, fail
# and recover again.
# OBD_FAIL_OST_EROFS 0x216
facet_failover $SINGLEMDS
do_facet ost1 "lctl set_param fail_loc=0x80000216"
- df $MOUNT || return 2
+ client_up || return 2
createmany -o $DIR/$tfile 20 20 || return 2
unlinkmany $DIR/$tfile 40 || return 3
# bug 3462 - simultaneous MDC requests
test_53a() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
mkdir -p $DIR/${tdir}-1
mkdir -p $DIR/${tdir}-2
multiop $DIR/${tdir}-1/f O_c &
run_test 53a "|X| close request while two MDC requests in flight"
test_53b() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
run_test 53b "|X| open request while two MDC requests in flight"
test_53c() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53c "|X| open request and close request while two MDC requests in flight"
test_53d() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
run_test 53d "|X| close reply while two MDC requests in flight"
test_53e() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
run_test 53e "|X| open reply while two MDC requests in flight"
test_53f() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53f "|X| open reply and close reply while two MDC requests in flight"
test_53g() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
-
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53g "|X| drop open reply and close request while close and open are both in flight"
test_53h() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
cancel_lru_locks mdc # force the close
sleep 1
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
createmany -o $DIR/$tdir/$tfile- 25
#define OBD_FAIL_TGT_REPLAY_DROP 0x707
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
- facet_failover $SINGLEMDS
- df $MOUNT || return 1
+ fail $SINGLEMDS
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
unlinkmany $DIR/$tdir/$tfile- 25 || return 2
return 0
at_start || return 0
CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
+
+ # exhaust precreations on ost1
+ local OST=$(lfs osts | grep ^0": " | awk '{print $2}' | sed -e 's/_UUID$//')
+ local mdtosc=$(get_mdtosc_proc_path $OST)
+ local last_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id)
+ local next_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id)
+
+ mkdir -p $DIR/$tdir/${OST}
+ lfs setstripe $DIR/$tdir/${OST} -o 0 -c 1 || error "setstripe"
+ echo "Creating to objid $last_id on ost $OST..."
#define OBD_FAIL_OST_PAUSE_CREATE 0x223
do_facet ost1 "sysctl -w lustre.fail_val=20000"
do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
- cp /etc/profile $DIR/$tfile || error "cp failed"
+ createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2))
+
client_reconnect
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
log "phase 2"
[ "$SLOW" = "no" ] && duration=60
local cmd="rundbench 1 -t $duration"
local PID=""
- do_nodes --verbose $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
+ do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \
LCTL=$LCTL $cmd" &
mount_facet ost1
touch $DIR/$tfile || return 1
rm $DIR/$tfile || return 2
- client_df || error "df failed: $?"
+ clients_up || error "client evicted: $?"
return 0
}
run_test 74 "Ensure applications don't fail waiting for OST recovery"
test_80a() {
- [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs" && return 0
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
mkdir -p $DIR/$tdir
replay_barrier mds2
run_test 83b "fail log_add during unlink recovery"
test_84a() {
-#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x143
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000143"
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144"
createmany -o $DIR/$tfile- 1 &
PID=$!
mds_evict_client
wait $PID
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
}
run_test 84a "stale open during export disconnect"
# Bug 22190: a synchronous write issued after a replay barrier has been set
# on ost1 is expected to FAIL; the test reports an error when the write
# succeeds instead.
# NOTE(review): the run_test description says the obd is in rdonly mode at
# that point -- presumably an effect of replay_barrier; confirm in the
# test framework.
+test_85() { # bug 22190
+ local fail=0
# turn on sync_journal for ost1's obdfilter device
+ do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 1"
+
+ replay_barrier ost1
# one stripe, starting at OST index 0
+ lfs setstripe -i 0 -c 1 $DIR/$tfile
# write 100 blocks of 4k with oflag=dsync; remember whether dd failed
+ dd oflag=dsync if=/dev/urandom of=$DIR/$tfile bs=4k count=100 || fail=1
+ fail_abort ost1
+ echo "FAIL $fail"
# fail == 0 means dd succeeded, which is the error case for this test
+ [ $fail -ne 0 ] || error "Write was successful"
+}
+run_test 85 "ensure there is no reply on bulk write if obd is in rdonly mode"
+
# Unmount the client mount point, write 0 to the exports.clear parameter of
# every mdt.${FSNAME}-MDT* device on $SINGLEMDS, then call remount_facet for
# that MDS. Per the run_test description the point is that stopping the
# server after clearing nid_stats must not hit an LBUG.
# NOTE(review): $MOUNT is not mounted again inside this function -- confirm
# that later cleanup copes with an unmounted client.
+test_86() {
+ umount $MOUNT
+ do_facet $SINGLEMDS lctl set_param mdt.${FSNAME}-MDT*.exports.clear=0
+ remount_facet $SINGLEMDS
+}
+run_test 86 "umount server after clear nid_stats should not hit LBUG"
+
# Print the completion banner and call check_and_cleanup_lustre. Then, if
# $TESTSUITELOG is a regular file, dump it and exit 1 when it contains
# "FAIL"; the trailing "|| true" makes the line evaluate to success in every
# other case.
equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true