set -e
-# bug number: LU-2012 10124
-ALWAYS_EXCEPT="14b 15c $REPLAY_DUAL_EXCEPT"
+# bug number: LU-2012 10124 LU-7372 LU-8333
+ALWAYS_EXCEPT="14b 15c 26 21b $REPLAY_DUAL_EXCEPT"
SAVE_PWD=$PWD
PTLDEBUG=${PTLDEBUG:--1}
remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+# 7 (min)"
[ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
[[ $(facet_fstype $SINGLEMDS) == zfs ]] &&
local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
- touch $MOUNT1/$tdir/$tfile
- #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
- statmany -s $MOUNT1/$tdir/f 1 500 &
+ touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
+ statmany -s $MOUNT1/$tdir/$tfile 1 500 &
OPENPID=$!
- NOW=$(date +%s)
+ NOW=$SECONDS
+ #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
sleep 1
#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
local tier2=$((SECONDS - before))
- # timeout is more than 2.25x original timeout
- ((tier2 < tier1 * 9 / 4)) ||
- error "recovery time $tier2 >= 2.25x original time $tier1"
+ # recovery time must stay below 1.5x the original recovery time
+ ((tier2 < tier1 * 6 / 4)) ||
+ error "recovery time $tier2 >= 1.5x original time $tier1"
}
run_test 20 "recovery time is not increasing"
test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
- # COS disabled (should fail)
- COS=0
- do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
-
# there is still a window when transactions may be written to disk
# before the mds device is set R/O. To avoid such a rare test failure,
# the check is repeated several times.
+ COS=0
local n_attempts=1
while true; do
+ # COS disabled (should fail)
+ do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
+
test_21b_sub $facet || break
n_attempts=$((n_attempts + 1))
[ $n_attempts -gt 3 ] &&
drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
sleep 1
-#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
# failover, replay and resend replayed waiting locks
- do_facet ost1 lctl set_param fail_loc=0x80000213
+ if [ $(lustre_version_code ost1) -ge $(version_code 2.6.90) ]; then
+ #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
+ do_facet ost1 lctl set_param fail_loc=0x80000325
+ else
+ #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
+ do_facet ost1 lctl set_param fail_loc=0x80000213
+ fi
+
fail ost1
# multiop does not finish because CP AST is skipped;
}
run_test 25 "replay|resend"
+cleanup_26() {
+ trap 0
+ kill -9 $tar_26_pid
+ kill -9 $dbench_26_pid
+ killall -9 dbench
+}
+
+test_26() {
+ local clients=${CLIENTS:-$HOSTNAME}
+
+ zconf_mount_clients $clients $MOUNT
+
+ local duration=600
+ [ "$SLOW" = "no" ] && duration=200
+ # set duration to 900 because it takes some time to boot node
+ [ "$FAILURE_MODE" = HARD ] && duration=900
+
+ local start_ts=$SECONDS
+ local rc=0
+
+ trap cleanup_26 EXIT
+ (
+ local tar_dir=$DIR/$tdir/run_tar
+ while true; do
+ test_mkdir -p -c$MDSCOUNT $tar_dir || break
+ if [ $MDSCOUNT -ge 2 ]; then
+ $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
+ error "set default dirstripe failed"
+ fi
+ cd $tar_dir || break
+ tar cf - /etc | tar xf - || error "tar failed"
+ cd $DIR/$tdir || break
+ rm -rf $tar_dir || break
+ done
+ )&
+ tar_26_pid=$!
+ echo "Started tar $tar_26_pid"
+
+ (
+ local dbench_dir=$DIR2/$tdir/run_dbench
+ while true; do
+ test_mkdir -p -c$MDSCOUNT $dbench_dir || break
+ if [ $MDSCOUNT -ge 2 ]; then
+ $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
+ error "set default dirstripe failed"
+ fi
+ cd $dbench_dir || break
+ rundbench 1 -D $dbench_dir -t 100 &>/dev/null || break
+ cd $DIR/$tdir || break
+ rm -rf $dbench_dir || break
+ done
+ )&
+ dbench_26_pid=$!
+ echo "Started dbench $dbench_26_pid"
+
+ local num_failovers=0
+ local fail_index=1
+ while [ $((SECONDS - start_ts)) -lt $duration ]; do
+ kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
+ kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
+ sleep 2
+ replay_barrier mds$fail_index
+ sleep 2 # give clients a time to do operations
+ # Increment the number of failovers
+ num_failovers=$((num_failovers + 1))
+ log "$TESTNAME fail mds$fail_index $num_failovers times"
+ fail mds$fail_index
+ if [ $fail_index -ge $MDSCOUNT ]; then
+ fail_index=1
+ else
+ fail_index=$((fail_index + 1))
+ fi
+ done
+ # stop the client loads
+ kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
+ kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
+ cleanup_26 || true
+}
+run_test 26 "dbench and tar with mds failover"
+
+# Check lock replay ordering across an OST failover: a second writer is
+# started through the other mount point while fail_loc 0x80000324 is set
+# on ost1, then ost1 is failed over with fail_loc 0x32a armed, and the
+# second writer must still complete successfully.
+#
+# Globals read: SETSTRIPE, LCTL, DIR, DIR2, tfile
+test_28() {
+	local first_path=$DIR2/$tfile
+	local second_path=$DIR/$tfile
+
+	# single-stripe file, written once through the second mount point
+	$SETSTRIPE -i 0 -c 1 $first_path
+	dd if=/dev/zero of=$first_path bs=4096 count=1
+
+	#define OBD_FAIL_LDLM_SRV_BL_AST 0x324
+	do_facet ost1 $LCTL set_param fail_loc=0x80000324
+
+	# background write to the same file through the first mount point
+	dd if=/dev/zero of=$second_path bs=4096 count=1 &
+	local writer_pid=$!
+	sleep 2
+
+	#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
+	do_facet ost1 $LCTL set_param fail_loc=0x32a
+
+	fail ost1
+
+	sleep 2
+	cancel_lru_locks OST0000-osc
+	# the background write must finish without error
+	wait $writer_pid || error "dd failed"
+}
+run_test 28 "lock replay should be ordered: waiting after granted"
+
complete $SECONDS
-SLEEP=$((`date +%s` - $NOW))
+SLEEP=$((SECONDS - $NOW))
[ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
[ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
check_and_cleanup_lustre