From aa6250b7412e7baf6760fe4010a81f4f22187127 Mon Sep 17 00:00:00 2001
From: Alexander Boyko
Date: Wed, 6 Apr 2022 05:39:27 -0400
Subject: [PATCH] LU-15724 tests: MDT failover hang reproducer

The patch adds recovery-small 144a test to reproduce MDT failover
hang when precreate threads are blocked on objects.

LustreError: 0-0: Forced cleanup waiting for mdt-kjcf05-MDT0001_UUID
namespace with 46 resources in use, (rc=-110)

Test-Parameters: trivial testlist=recovery-small env=ONLY=144a
HPE-bug-id: LUS-10750
Signed-off-by: Alexander Boyko
Change-Id: I2743a1b5c8911d6982b527f7e7b7bbbaf310cd04
Reviewed-on: https://review.whamcloud.com/47006
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexey Lyashkov
Reviewed-by: Sergey Cheremencev
Reviewed-by: Oleg Drokin
---
 lustre/tests/recovery-small.sh | 56 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 713ccb4..ad27535 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -3085,6 +3085,62 @@ test_143() {
 }
 run_test 143 "orphan cleanup thread shouldn't be blocked even delete failed"
 
+# LU-15724: MDT failover must not hang while precreate threads are blocked.
+# Start creates in an overstriped directory, fail ost1 while they are in
+# flight, then check that failing over mds1 completes within the time limit.
+test_144a() {
+	[[ $($LCTL get_param mdc.*.import) =~ connect_flags.*overstriping ]] ||
+		skip "server does not support overstriping"
+
+	local pids=""
+	local setcount=1000
+	local mds_timeout
+	local mdts
+	local before
+	local after
+	local diff
+	local pid
+	local i
+
+	large_xattr_enabled || skip_env "ea_inode feature disabled"
+	test_mkdir -i 0 -c 1 -p $DIR/$tdir
+	stack_trap "rm -rf $DIR/$tdir" EXIT
+
+	# set timeout to 300 on the MDT nodes, restore the old value on exit
+	mdts=$(comma_list $(mdts_nodes))
+	mds_timeout=$(do_facet mds1 $LCTL get_param -n timeout)
+	do_nodes $mdts $LCTL set_param timeout=300
+	stack_trap "do_nodes $mdts $LCTL set_param timeout=$mds_timeout" EXIT
+
+	$LFS setstripe -i 0 -C $setcount $DIR/$tdir || error "setstripe failed"
+
+	# run the creates in the background; they are killed below, after
+	# ost1 has been failed
+	for (( i = 0; i < 50; i++ )); do
+		# braces are required: "$tfile_" would name another variable
+		touch $DIR/$tdir/${tfile}_$i &
+		pids="$pids $!"
+	done
+
+	fail ost1
+	sleep 60
+
+	for pid in $pids; do
+		kill -9 $pid >/dev/null 2>&1
+	done
+
+	before=$(date +%s)
+	fail mds1
+	after=$(date +%s)
+	# Here we measure MDT stop + MDT start time. In the error case MDT stop
+	# takes about obd_timeout-60 (240) seconds; without the error it takes
+	# less than 30s. MDT start time depends on the configuration, so check
+	# against the worst case.
+	diff=$((after - before))
+	(( diff < 240 )) || error "MDT failover took $diff seconds"
+}
+run_test 144a "MDT failover should stop precreation threads"
+
 test_145() {
 	[ $MDSCOUNT -lt 3 ] && skip "needs >= 3 MDTs"
 	[ $(facet_active_host mds2) = $(facet_active_host mds3) ] &&
-- 
1.8.3.1