Whamcloud - gitweb
LU-15724 tests: MDT failover hang reproducer 06/47006/8
author Alexander Boyko <alexander.boyko@hpe.com>
Wed, 6 Apr 2022 09:39:27 +0000 (05:39 -0400)
committer Oleg Drokin <green@whamcloud.com>
Mon, 6 Jun 2022 06:27:34 +0000 (06:27 +0000)
The patch adds recovery-small test 144a to reproduce the MDT failover
hang that occurs when precreate threads are blocked on objects.

LustreError: 0-0: Forced cleanup waiting for mdt-kjcf05-MDT0001_UUID
namespace with 46 resources in use, (rc=-110)

Test-Parameters: trivial testlist=recovery-small env=ONLY=144a
HPE-bug-id: LUS-10750
Signed-off-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I2743a1b5c8911d6982b527f7e7b7bbbaf310cd04
Reviewed-on: https://review.whamcloud.com/47006
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Sergey Cheremencev <sergey.cheremencev@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/recovery-small.sh

index 713ccb4..ad27535 100755 (executable)
@@ -3085,6 +3085,51 @@ test_143() {
 }
 run_test 143 "orphan cleanup thread shouldn't be blocked even delete failed"
 
+test_144a() {
+       [[ $($LCTL get_param mdc.*.import) =~ connect_flags.*overstriping ]] ||
+               skip "server does not support overstriping"
+
+       local pids=""
+       local setcount=1000
+       local mds_timeout
+       local before
+       local after
+       local diff
+
+       large_xattr_enabled || skip_env "ea_inode feature disabled"
+       test_mkdir -i 0 -c 1 -p $DIR/$tdir
+       stack_trap "rm -rf $DIR/$tdir" EXIT
+
+       mds_timeout=$(do_facet mds1 $LCTL get_param -n timeout)
+       do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param timeout=300
+       stack_trap "do_nodes $(comma_list $(mdts_nodes)) $LCTL set_param timeout=$mds_timeout" EXIT
+
+       $LFS setstripe -i 0 -C $setcount $DIR/$tdir || error "setstripe failed"
+
+       for (( i = 0; i < 50; i++)); do
+               touch $DIR/$tdir/$tfile_$i &
+               pids="$pids $!"
+       done
+
+       fail ost1
+       sleep 60
+
+       for pid in $pids; do
+               kill -9 $pid >/dev/null 2>&1
+       done
+
+       before=$(date +%s)
+       fail mds1
+       after=$(date +%s)
+       # here we measure MDT stop + MDT start time. For error case MDT stop takes
+       # about obd_timeout-60 (240) seconds. Without error - less than 30s.
+       # MDT start takes different time depends on a configuration, let's check
+       # the worst.
+       diff=$((after - before))
+       (( $diff < 240 )) || error "MDT failover took $diff seconds"
+}
+run_test 144a "MDT failover should stop precreation threads"
+
 test_145() {
        [ $MDSCOUNT -lt 3 ] && skip "needs >= 3 MDTs"
        [ $(facet_active_host mds2) = $(facet_active_host mds3) ] &&