Whamcloud - gitweb
LU-14027 tests: Fix test_135 of replay-single 27/41227/12
authorEtienne AUJAMES <eaujames@ddn.com>
Thu, 14 Jan 2021 16:58:40 +0000 (17:58 +0100)
committerOleg Drokin <green@whamcloud.com>
Wed, 25 Oct 2023 17:45:07 +0000 (17:45 +0000)
The test_135 ("Server failure in lock replay phase") of replay-single
has an error in the method used to get the PID of a background process:
it captured "$?" (the last exit status) instead of "$!" (the PID of the
most recently backgrounded job) after launching the background sync.

Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Fixes: 7ca495ec67 ("LU-14027 ldlm: Do not hang if recovery restarted during lock replay")
Test-Parameters: trivial
Test-Parameters: testlist=replay-single env=ONLY=135,ONLY_REPEAT=50
Test-Parameters: testlist=replay-single env=ONLY_REPEAT=20,ONLY=135,FAILURE_MODE=HARD clientcount=4 mdtcount=1 mdscount=2 osscount=2 austeroptions=-R failover=true iscsi=1 testlist=replay-single
Change-Id: I6ed41d75f4cbba796e39288bad8895ee1c24459f
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/41227
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/replay-single.sh

index 58fd571..9dbc21c 100755 (executable)
@@ -4982,6 +4982,9 @@ run_test 134 "replay creation of a file created in a pool"
 
 # LU-14027
 test_135() {
+       local PID
+       local old_replay
+
        # make sure we are using the primary server
        [[ $(facet_active ost1) == "ost1" ]] || fail ost1
 
@@ -4990,31 +4993,34 @@ test_135() {
        # All files to ost1
        $LFS setstripe -S $((128 * 1024)) -i 0 $DIR/$tdir
 
+       # Init all the clients connections (write lastrcv on the OST)
+       clients_up
        replay_barrier ost1
 
+       old_replay=$($LCTL get_param ldlm.cancel_unused_locks_before_replay)
+       $LCTL set_param ldlm.cancel_unused_locks_before_replay=0
+       stack_trap "$LCTL set_param $old_replay" EXIT
+
        # Create 20 files so we have 20 ost locks
        for i in $(seq 20) ; do
-               echo blah > $DIR/$tdir/file.${i}
+               echo blah > $DIR/$tdir/file.${i} & PID+="$! "
        done
+       wait $PID
 
-       shutdown_facet ost1
-       reboot_facet ost1
+       stop ost1
        change_active ost1
        wait_for_facet ost1
 
-       #define OBD_FAIL_TGT_REPLAY_RECONNECT     0x32d
+       #define OBD_FAIL_LDLM_LOCK_REPLAY       0x32d
        # Make sure lock replay server side never completes and errors out.
        do_rpc_nodes $(facet_active_host ost1) \
                load_module ../libcfs/libcfs/libcfs
-       do_facet ost1 "$LCTL set_param fail_val=20"
-       do_facet ost1 "$LCTL set_param fail_loc=0x32d"
-
+       do_facet ost1 "$LCTL set_param fail_loc=0x32d fail_val=20"
        mount_facet ost1
 
        # Now make sure we notice
-       (sync;sync;sync) &
-       local PID=$?
-       sleep 20 # should we do something proactive to make reconnects go?
+       (sync;sync;sync;echo "End of sync") & PID=$!
+       wait_clients_import_state ${HOSTNAME} ost1 REPLAY_LOCKS
        kill -0 $PID || error "Unexpected sync success"
 
        shutdown_facet ost1
@@ -5029,6 +5035,8 @@ test_135() {
        unmountoss
        mountoss
        clients_up || clients_up || error "$LFS df $MOUNT failed"
+
+       wait $PID || error "Fail to sync"
        echo blah > $DIR/$tdir/file.test2
 
        rm -rf $DIR/$tdir