df -P $DIR || df -P $DIR || true # reconnect
wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
- # FIXME just because recovery is done doesn't mean we've finished
- # orphan cleanup. Fake it with a sleep for now...
- sleep 10
+ # just because recovery is done doesn't mean we've finished
+ # orphan cleanup. Wait for llogs to get synchronized.
+ echo waiting for orphan cleanup...
+ while [ true ]; do
+ local -a sync=($(do_facet ost "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}'))
+ local con=1
+ for ((i=0; i<${#sync[@]}; i++)); do
+ [ ${sync[$i]} -eq 0 ] && continue
+ # an MDS-OST synchronization has not finished yet
+ con=0
+ break;
+ done
+ [ ${con} -eq 1 ] && break
+ sleep 1
+ done
+
+ # let the statfs cache get old enough.
+ sleep 1
+
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
[ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
[ "$mdcdev" ] || exit 2
# adaptive timeouts slow this way down
- if at_is_valid && at_is_enabled; then
+ if at_is_enabled; then
at_max_saved=$(at_max_get mds)
at_max_set 40 mds
fi
echo "Cleaning up AT ..."
if [ -n "$ATOLDBASE" ]; then
- local at_history=$(do_facet mds "find /sys/ -name at_history")
- do_facet mds "echo $ATOLDBASE >> $at_history" || true
- do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
+ local at_history=$($LCTL get_param -n at_history)
+ do_facet mds "lctl set_param at_history=$at_history" || true
+ do_facet ost1 "lctl set_param at_history=$at_history" || true
fi
if [ $AT_MAX_SET -ne 0 ]; then
at_start()
{
local at_max_new=600
- if ! at_is_valid; then
- skip "AT env is invalid"
- return 1
- fi
# Save at_max original values
local facet
done
if [ -z "$ATOLDBASE" ]; then
- local at_history=$(do_facet mds "find /sys/ -name at_history")
- [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1
- ATOLDBASE=$(do_facet mds "cat $at_history")
+ ATOLDBASE=$(do_facet mds "lctl get_param -n at_history")
# speed up the timebase so we can check decreasing AT
- do_facet mds "echo 8 >> $at_history"
- do_facet ost1 "echo 8 >> $at_history"
+ do_facet mds "lctl set_param at_history=8" || true
+ do_facet ost1 "lctl set_param at_history=8" || true
# sleep for a while to cool down, should be > 8s and also allow
# at least one ping to be sent. simply use TIMEOUT to be safe.
do_facet ost1 "sysctl -w lustre.fail_loc=0"
CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
ATTEMPTS=$(($CONN2 - $CONN1))
- echo "$ATTEMPTS osc reconnect attemps on gradual slow"
+ echo "$ATTEMPTS osc reconnect attempts on gradual slow"
[ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect"
return 0
}
log "phase 2"
CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
ATTEMPTS=$(($CONN2 - $CONN1))
- echo "$ATTEMPTS osc reconnect attemps on instant slow"
+ echo "$ATTEMPTS osc reconnect attempts on instant slow"
# do it again; should not timeout
do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
cp /etc/profile $DIR/$tfile || error "cp failed"
do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
ATTEMPTS=$(($CONN3 - $CONN2))
- echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
+ echo "$ATTEMPTS osc reconnect attempts on 2nd slow"
[ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
return 0
}
zconf_mount_clients $clients $DIR
- local duration=120
+ local duration=300
[ "$SLOW" = "no" ] && duration=60
local cmd="rundbench 1 -t $duration"
local PID=""
LCTL=$LCTL $cmd" &
PID=$!
log "Started rundbench load PID=$PID ..."
-
- sleep $((duration / 4))
- replay_barrier $SINGLEMDS
- sleep 3 # give clients a time to do operations
-
- log "$TESTNAME fail mds 1"
- fail $SINGLEMDS
-
+ ELAPSED=0
+ NUM_FAILOVERS=0
+ START_TS=$(date +%s)
+ CURRENT_TS=$START_TS
+ while [ $ELAPSED -lt $duration ]; do
+ sleep 1
+ replay_barrier $SINGLEMDS
+ sleep 1 # give clients a time to do operations
+ # Increment the number of failovers
+ NUM_FAILOVERS=$((NUM_FAILOVERS+1))
+ log "$TESTNAME fail mds1 $NUM_FAILOVERS times"
+ facet_failover $SINGLEMDS
+ CURRENT_TS=$(date +%s)
+ ELAPSED=$((CURRENT_TS - START_TS))
+ done
wait $PID || error "rundbench load on $CLIENTS failed!"
}
run_test 70b "mds recovery; $CLIENTCOUNT clients"
mount_facet ost1
touch $DIR/$tfile || return 1
rm $DIR/$tfile || return 2
+ df $MOUNT || error "df failed: $?"
return 0
}
run_test 74 "Ensure applications don't fail waiting for OST reocvery"
}
run_test 82b "CMD: mkdir cross-node dir (fail mds with name)"
+# test_84: "stale open during export disconnect" (see the run_test line below).
+# Sets fail_loc on the mds facet, starts a createmany in the background,
+# calls mds_evict_client while that job may still be running, then waits for
+# the job and runs df to reconnect.
+# Writes the global PID (it is not declared local here).
+# Always returns 0: the last command ends in "|| true".
+test_84() {
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x143
+ # 0x80000143 is the 0x143 code named above with the 0x80000000 bit set
+ # NOTE(review): that high bit is presumably the "fail once" flag -- confirm
+ do_facet mds "lctl set_param fail_loc=0x80000143"
+ # backgrounded so that mds_evict_client below runs without waiting for it
+ # NOTE(review): presumably the fail_loc stalls this open on the MDS -- confirm
+ createmany -o $DIR/$tfile- 1 &
+ PID=$!
+ mds_evict_client
+ # reap the background createmany; its exit status is not inspected
+ wait $PID
+ # df is tried twice and any failure is ignored, so this step cannot fail
+ df -P $DIR || df -P $DIR || true # reconnect
+}
+run_test 84 "stale open during export disconnect"
+
equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true