. $LUSTRE/tests/test-framework.sh
init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
CHECK_GRANT=${CHECK_GRANT:-"yes"}
GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
-remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
+require_dsh_mds || exit 0
# Skip these tests
-# bug number: 17466 15962
-ALWAYS_EXCEPT="61d $REPLAY_SINGLE_EXCEPT"
+# bug number: 17466 18857
+ALWAYS_EXCEPT="61d 33a 33b $REPLAY_SINGLE_EXCEPT"
if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
lfs getstripe $DIR/$tfile || return 1
rm -f $DIR/$tfile || return 2 # make it an orphan
mds_evict_client
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
fail $SINGLEMDS # start orphan recovery
- df -P $DIR || df -P $DIR || true # reconnect
wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
-
- # just because recovery is done doesn't mean we've finished
- # orphan cleanup. Wait for llogs to get synchronized.
- echo waiting for orphan cleanup...
- while [ true ]; do
- local -a sync=($(do_facet ost "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}'))
- local con=1
- for ((i=0; i<${#sync[@]}; i++)); do
- [ ${sync[$i]} -eq 0 ] && continue
- # there is a not finished MDS-OST synchronization
- con=0
- break;
- done
- [ ${con} -eq 1 ] && break
- sleep 1
- done
-
- # let the statfs cache to get old enough.
- sleep 1
-
+ wait_mds_ost_sync || return 3
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
[ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
ls -la $DIR/$tfile
mds_evict_client
-
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
kill -USR1 $pid
- test -s $DIR/$tfile || error "File was truncated"
-
wait $pid || return 1
+ [ -s $DIR/$tfile ] || error "File was truncated"
+
return 0
}
run_test 20c "check that client eviction does not affect file content"
multiop_bg_pause $DIR/$tfile O_c || return 3
pid2=$!
mds_evict_client
- df $MOUNT || sleep 1 && df $MOUNT || return 1
+ client_up || client_up || return 1
kill -USR1 $pid1
kill -USR1 $pid2
wait $pid1 || return 4
sync
return 0
}
+start_full_debug_logging
run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
+stop_full_debug_logging
test_38() {
createmany -o $DIR/$tfile-%d 800
}
run_test 43 "mds osc import failure during recovery; don't LBUG"
-test_44a() { # was test_44
+test_44a() { # was test_44
local at_max_saved=0
mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; }
# adaptive timeouts slow this way down
if at_is_enabled; then
fi
for i in `seq 1 10`; do
- echo "$i of 10 ($(date +%s))"
- do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
- #define OBD_FAIL_TGT_CONN_RACE 0x701
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
- $LCTL --device $mdcdev recover
- df $MOUNT
+ echo "$i of 10 ($(date +%s))"
+ do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
+ #define OBD_FAIL_TGT_CONN_RACE 0x701
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
+ # lctl below may fail, it is a valid case
+ $LCTL --device $mdcdev recover
+ df $MOUNT
done
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
[ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
run_test 44a "race in target handle connect"
test_44b() {
- mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
+ local mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; }
+
for i in `seq 1 10`; do
echo "$i of 10 ($(date +%s))"
- do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
- #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
- $LCTL --device $mdcdev recover
- df $MOUNT
+ do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service"
+ #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
+ # lctl below may fail, it is a valid case
+ $LCTL --device $mdcdev recover
+ df $MOUNT
done
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
return 0
# Handle failed close
test_45() {
mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`
- [ "$mdcdev" ] || exit 2
- $LCTL --device $mdcdev recover
+ [ "$mdcdev" ] || return 2
+ [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; }
+
+ $LCTL --device $mdcdev recover || return 6
multiop_bg_pause $DIR/$tfile O_c || return 1
pid=$!
# OBD_FAIL_OST_CREATE_NET 0x204
fail ost1
do_facet ost1 "lctl set_param fail_loc=0x80000204"
- df $MOUNT || return 2
+ client_up || return 2
# let the MDS discover the OST failure, attempt to recover, fail
# and recover again.
# OBD_FAIL_OST_EROFS 0x216
facet_failover $SINGLEMDS
do_facet ost1 "lctl set_param fail_loc=0x80000216"
- df $MOUNT || return 2
+ client_up || return 2
createmany -o $DIR/$tfile 20 20 || return 2
unlinkmany $DIR/$tfile 40 || return 3
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
run_test 53c "|X| open request and close request while two MDC requests in flight"
test_53d() {
+ cancel_lru_locks mdc # cleanup locks from former test cases
rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2
mkdir -p $DIR/${tdir}-1
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
kill -USR1 $close_pid
cancel_lru_locks mdc # force the close
-
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
+
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
cancel_lru_locks mdc # force the close
sleep 1
+ #bz20647: make sure all pids exist before failover
+ [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
+ [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
replay_barrier_nodf $SINGLEMDS
fail_nodf $SINGLEMDS
wait $open_pid || return 1
createmany -o $DIR/$tdir/$tfile- 25
#define OBD_FAIL_TGT_REPLAY_DROP 0x707
do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
- facet_failover $SINGLEMDS
- df $MOUNT || return 1
+ fail $SINGLEMDS
do_facet $SINGLEMDS "lctl set_param fail_loc=0"
unlinkmany $DIR/$tdir/$tfile- 25 || return 2
return 0
at_start || return 0
CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
+
+ # exhaust precreations on ost1
+ local OST=$(lfs osts | grep 0": " | awk '{print $2}' | sed -e 's/_UUID$//')
+ local mdtosc=$(get_mdtosc_proc_path $OST)
+ local last_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id)
+ local next_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id)
+
+ mkdir -p $DIR/$tdir/${OST}
+ lfs setstripe $DIR/$tdir/${OST} -o 0 -c 1 || error "setstripe"
+ echo "Creating to objid $last_id on ost $OST..."
#define OBD_FAIL_OST_PAUSE_CREATE 0x223
do_facet ost1 "sysctl -w lustre.fail_val=20000"
do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
- cp /etc/profile $DIR/$tfile || error "cp failed"
+ createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2))
+
client_reconnect
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
log "phase 2"
[ "$SLOW" = "no" ] && duration=60
local cmd="rundbench 1 -t $duration"
local PID=""
- do_nodes --verbose $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
+ do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \
LCTL=$LCTL $cmd" &
mount_facet ost1
touch $DIR/$tfile || return 1
rm $DIR/$tfile || return 2
- client_df || error "df failed: $?"
+ clients_up || error "client evicted: $?"
return 0
}
run_test 74 "Ensure applications don't fail waiting for OST recovery"
test_80a() {
- [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs" && return 0
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
mkdir -p $DIR/$tdir
replay_barrier mds2
run_test 83b "fail log_add during unlink recovery"
test_84a() {
-#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x143
- do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000143"
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144"
createmany -o $DIR/$tfile- 1 &
PID=$!
mds_evict_client
wait $PID
- df -P $DIR || df -P $DIR || true # reconnect
+ client_up || client_up || true # reconnect
}
run_test 84a "stale open during export disconnect"