set -e
-# bug 5493 LU2034
-ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT"
-
export MULTIOP=${MULTIOP:-multiop}
PTLDEBUG=${PTLDEBUG:--1}
LUSTRE=${LUSTRE:-`dirname $0`/..}
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
+ALWAYS_EXCEPT="$RECOVERY_SMALL_EXCEPT"
+if [ "$MDSCOUNT" -gt "1" ]; then
+ # bug number for skipped test: LU-10931
+ ALWAYS_EXCEPT+=" 136"
+ # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+fi
+
require_dsh_mds || exit 0
# also long tests: 19, 21a, 21e, 21f, 23, 27
awk '{sub("_UUID", "", $2); print $2;}')
#assume one client
mdccli=$($LCTL dl | grep "${mdtname}-mdc" | awk '{print $4;}')
- conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid)
+ conn_uuid=$($LCTL get_param -n mdc.${mdccli}.conn_uuid)
mdcpath="mdc.${mdccli}.import=connection=${conn_uuid}"
drop_bl_callback_once "chmod 0777 ${workdir}" &
# test of open reconstruct
test_53() {
touch $DIR/$tfile
- drop_ldlm_reply "openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tfile" ||\
+ drop_mdt_ldlm_reply "openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tfile" ||\
return 2
}
run_test 53 "touch: drop rep"
#}
#run_test 62 "Verify connection flags race - bug LU-1716"
+# Exercise a lock enqueue whose export is evicted while the enqueue is
+# still waiting, with a second enqueue queued up behind it.
+test_65() {
+	# PIDs of the two background multiop writers; kept local to the test
+	local pid1
+	local pid2
+
+	mount_client $DIR2
+
+	#grant lock1, export2
+	$SETSTRIPE -i 0 $DIR2/$tfile || return 1
+	$MULTIOP $DIR2/$tfile Ow || return 2
+
+#define OBD_FAIL_LDLM_BL_EVICT 0x31e
+	do_facet ost $LCTL set_param fail_loc=0x31e
+	#get waiting lock2, export1
+	$MULTIOP $DIR/$tfile Ow &
+	pid1=$!
+	# let the enqueue go to sleep
+	sleep 2
+
+	#get lock2 blocked
+	$MULTIOP $DIR2/$tfile Ow &
+	pid2=$!
+	sleep 2
+
+	#evict export1
+	ost_evict_client
+
+	sleep 2
+	do_facet ost $LCTL set_param fail_loc=0
+
+	# reap both writers before dropping the second mount
+	wait $pid1
+	wait $pid2
+
+	umount_client $DIR2
+}
+run_test 65 "lock enqueue for destroyed export"
+
test_66()
{
[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.51) ]] ||
# drop 1 reply with UPDATE lock
mcreate $DIR/$tfile || error "mcreate failed: $?"
- drop_ldlm_reply_once "stat $DIR/$tfile" &
+ drop_mdt_ldlm_reply_once "stat $DIR/$tfile" &
sleep 2
# make the re-sent lock to sleep
do_nodes $list $LCTL set_param fail_loc=0x80000136
#initiate the re-connect & re-send
- local mdccli=$($LCTL dl | awk '/-mdc-/ {print $4;}')
- local conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid)
+ local mdccli=$($LCTL dl | awk '/-MDT0000-mdc-/ {print $4;}')
+ local conn_uuid=$($LCTL get_param -n mdc.${mdccli}.conn_uuid)
$LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}"
sleep 2
}
run_test 66 "lock enqueue re-send vs client eviction"
-test_65() {
- mount_client $DIR2
-
- #grant lock1, export2
- $SETSTRIPE -i -0 $DIR2/$tfile || return 1
- $MULTIOP $DIR2/$tfile Ow || return 2
-
-#define OBD_FAIL_LDLM_BL_EVICT 0x31e
- do_facet ost $LCTL set_param fail_loc=0x31e
- #get waiting lock2, export1
- $MULTIOP $DIR/$tfile Ow &
- PID1=$!
- # let enqueue to get asleep
- sleep 2
-
- #get lock2 blocked
- $MULTIOP $DIR2/$tfile Ow &
- PID2=$!
- sleep 2
-
- #evict export1
- ost_evict_client
+test_67()
+{
+#define OBD_FAIL_PTLRPC_CONNECT_RACE 0x531
+ $LCTL set_param fail_loc=0x80000531
+ local mdtname="MDT0000"
+ local mdccli=$($LCTL dl | grep "${mdtname}-mdc" | awk '{print $4;}')
+ local conn_uuid=$($LCTL get_param -n mdc.${mdccli}.mds_conn_uuid)
+ $LCTL set_param "mdc.${mdccli}.import=connection=${conn_uuid}" &
sleep 2
- do_facet ost $LCTL set_param fail_loc=0
- wait $PID1
- wait $PID2
+ mds_evict_client
+ sleep 1
- umount_client $DIR2
+ client_reconnect
+ wait
}
-run_test 65 "lock enqueue for destroyed export"
+run_test 67 "connect vs import invalidate race"
check_cli_ir_state()
{
}
run_test 100 "IR: Make sure normal recovery still works w/o IR"
-test_101()
+test_101a()
{
- do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
- { skip "MGS without IR support"; return 0; }
+ do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
+ skip "MGS without IR support"
- set_ir_status full
+ set_ir_status full
- local OST1_IMP=$(get_osc_import_name client ost1)
+ local ost1_imp=$(get_osc_import_name client ost1)
- # disable pinger recovery
- lctl set_param -n osc.$OST1_IMP.pinger_recov=0
+ # disable pinger recovery
+ lctl set_param -n osc.$ost1_imp.pinger_recov=0
+ stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT
- fail ost1
+ fail ost1
- target_instance_match ost1 || error "instance mismatch"
- nidtbl_versions_match || error "version must match"
+ target_instance_match ost1 || error "instance mismatch"
+ nidtbl_versions_match || error "version must match"
+}
+run_test 101a "IR: Make sure IR works w/o normal recovery"
+
+test_101b()
+{
+ do_facet mgs $LCTL list_param mgs.*.ir_timeout ||
+ skip "MGS without IR support"
+
+ set_ir_status full
+
+ local ost1_imp=$(get_osc_import_name client ost1)
- lctl set_param -n osc.$OST1_IMP.pinger_recov=1
+#define OBD_FAIL_OST_PREPARE_DELAY 0x247
+ do_facet ost1 "$LCTL set_param fail_loc=0x247"
+ # disable pinger recovery
+ $LCTL set_param -n osc.$ost1_imp.pinger_recov=0
+ stack_trap "$LCTL set_param -n osc.$ost1_imp.pinger_recov=1" EXIT
+
+#OST may return EAGAIN if it is not configured yet
+ fail ost1
}
-run_test 101 "IR: Make sure IR works w/o normal recovery"
+run_test 101b "IR: Make sure IR works w/o normal recovery and proceed EAGAIN"
test_102()
{
touch $DIR2/$tfile || error "failed to create empty file"
replay_barrier $SINGLEMDS
- $LCTL set_param debug=console
+ $LCTL set_param debug=ha
$LCTL clear
facet_failover $SINGLEMDS
- # lightweight connection must be evicted
+ # lightweight connection goes through LUSTRE_IMP_RECOVER during failover
touch -c $DIR2/$tfile || true
$LCTL dk $TMP/lustre-log-$TESTNAME.log
- evicted=`awk '/This client was evicted by .*MDT0000/ {
+ recovered=`awk '/MDT0000-mdc-[0-9a-f]*: lwp recover/ {
print;
}' $TMP/lustre-log-$TESTNAME.log`
- [ -z "$evicted" ] && error "lightweight client not evicted by mds"
+ [ -z "$recovered" ] && error "lightweight client was not recovered"
# and all operations performed by lightweight client should be
# synchronous, so the file created before mds restart should be there
# drop 1 reply with UPDATE lock,
# resend should not create 2nd lock on server
mcreate $DIR/$tfile || error "mcreate failed: $?"
- drop_ldlm_reply_once "stat $DIR/$tfile" || error "stat failed: $?"
+ drop_mdt_ldlm_reply_once "stat $DIR/$tfile" || error "stat failed: $?"
# 2 BL AST will be sent to client, both must find the same lock,
# race them to not get EINVAL for 2nd BL AST
multiop_bg_pause $DIR/$tfile O_jc || return 1
PID=$!
- #define OBD_FAIL_LDLM_REPLY 0x30c
- do_nodes $list $LCTL set_param fail_loc=0x8000030c
+ #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
+ do_nodes $list $LCTL set_param fail_loc=0x80000157
kill -USR1 $PID
echo "waiting for multiop $PID"
wait $PID || return 2
}
run_test 134 "race between failover and search for reply data free slot"
+test_135() {
+ [ $MDS1_VERSION -lt $(version_code 2.12.51) ] &&
+ skip "Need MDS version at least 2.12.51"
+
+ mkdir -p $DIR/$tdir
+ $LFS setstripe -E 1M -L mdt $DIR/$tdir
+ # to have parent dir write lock before open/resend
+ touch $DIR/$tdir/$tfile
+ #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
+ do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param fail_loc=0x80000157
+ openfile -f O_RDWR:O_CREAT -m 0755 $DIR/$tdir/$tfile ||
+ error "Failed to open DOM file"
+}
+run_test 135 "DOM: open/create resend to return size"
+
+# Hard-fail the MDS while a background changelog_deregister is held at
+# OBD_FAIL_LLOG_PURGE_DELAY, then verify after restart that either the
+# changelog user is still listed or the changelog size has shrunk.
+test_136() {
+	remote_mds_nodsh && skip "remote MDS with nodsh"
+	[[ $MDS1_VERSION -ge $(version_code 2.12.52) ]] ||
+		skip "Need MDS version at least 2.12.52"
+
+	local mdts=$(comma_list $(mdts_nodes))
+	local MDT0=$(facet_svc $SINGLEMDS)
+	# saved state, restored before the test returns
+	local cl_mask
+	local failure_mode_save
+
+	local clog=$(do_facet mds1 $LCTL --device $MDT0 changelog_register -n)
+	[ -n "$clog" ] || error "changelog_register failed"
+	cl_mask=$(do_facet mds1 $LCTL get_param \
+			mdd.$MDT0.changelog_mask -n)
+	changelog_chmask "ALL"
+
+	# generate some changelog records to accumulate
+	test_mkdir -i 0 -c 0 $DIR/$tdir || error "mkdir $tdir failed"
+	createmany -m $DIR/$tdir/$tfile 10000 ||
+		error "create $DIR/$tdir/$tfile failed"
+
+	local size1=$(do_facet $SINGLEMDS \
+			$LCTL get_param -n mdd.$MDT0.changelog_size)
+	echo "Changelog size $size1"
+
+	#define OBD_FAIL_LLOG_PURGE_DELAY 0x1318
+	do_nodes $mdts $LCTL set_param fail_loc=0x1318 fail_val=30
+
+	# launch changelog_deregister in background on MDS
+	do_facet mds1 "nohup $LCTL --device $MDT0 changelog_deregister $clog \
+		> foo.out 2> foo.err < /dev/null &"
+	# give time to reach fail_loc
+	sleep 15
+
+	# fail_loc will make MDS sleep in the middle of changelog_deregister
+	# take this opportunity to abruptly kill MDS
+	failure_mode_save=$FAILURE_MODE
+	FAILURE_MODE=HARD
+	fail mds1
+	FAILURE_MODE=$failure_mode_save
+
+	do_nodes $mdts $LCTL set_param fail_loc=0x0 fail_val=0
+
+	local size2=$(do_facet $SINGLEMDS \
+			$LCTL get_param -n mdd.$MDT0.changelog_size)
+	echo "Changelog size $size2"
+	local clog2=$(do_facet $SINGLEMDS "$LCTL get_param -n \
+			mdd.$MDT0.changelog_users | grep $clog")
+	echo "After crash, changelog user $clog2"
+
+	# an empty size would make the numeric comparison meaningless
+	[[ -n "$size1" && -n "$size2" ]] ||
+		error "cannot read changelog_size: '$size1' -> '$size2'"
+	# pass when the user survived the crash or records were purged
+	[[ -n "$clog2" || "$size2" -lt "$size1" ]] ||
+		error "changelog record count unchanged"
+
+	do_facet mds1 $LCTL set_param mdd.$MDT0.changelog_mask=\'$cl_mask\' -n
+}
+run_test 136 "changelog_deregister leaving pending records"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status