Whamcloud - gitweb
LU-18392 tests: hold group locks in recovery-small/160 46/56846/4
author Li Dongyang <dongyangli@ddn.com>
Fri, 1 Nov 2024 06:55:36 +0000 (17:55 +1100)
committer Oleg Drokin <green@whamcloud.com>
Mon, 9 Dec 2024 06:13:32 +0000 (06:13 +0000)
recovery-small/160 starts 10 threads, each holding a group lock
for 20s.
The OSP destroy retries could actually reach the OFD after the 20s
delay, leaving those RPCs waiting for the OST commit while increasing
destroys_in_flight.

Make sure we hold on to the group locks while checking
destroys_in_flight, and add an additional check to make sure
the object destroys are actually done after releasing the group locks.

Do not cast the transno to a 32-bit unsigned in osp_sync_interpret();
the truncation could produce a confusing transno in the debug log.

Test-Parameters: trivial testlist=recovery-small env=ONLY=160,ONLY_REPEAT=100
Fixes: 27f787daa7 ("LU-15737 ofd: don't block destroys")
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Change-Id: I2a6ac9700a3e79e9930cee905c1da73da948ba1a
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56846
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/osp/osp_sync.c
lustre/tests/recovery-small.sh

index 140f9ae..d56c55c 100644 (file)
@@ -556,9 +556,8 @@ static int osp_sync_interpret(const struct lu_env *env,
        }
        LASSERT(d);
 
-       CDEBUG(D_HA, "reply req %p/%d, rc %d, transno %u\n", req,
-              atomic_read(&req->rq_refcount),
-              rc, (unsigned) req->rq_transno);
+       CDEBUG(D_HA, "reply req %p/%d, rc %d, transno %llu\n", req,
+              atomic_read(&req->rq_refcount), rc, req->rq_transno);
 
        if (rc == -ENOENT) {
                /*
index 882935a..55e54ae 100755 (executable)
@@ -3726,8 +3726,8 @@ test_160() {
 
        for ((i = 1; i <= threads; i++)); do
                local file=$DIR/$tdir/file_$i
-               #open/group lock/write/unlink/pause 20s/group unlock/close
-               $MULTIOP $file OG1234w10240u_20g1234c &
+               #open/group lock/write/unlink/pause/group unlock/close
+               $MULTIOP $file OG1234w10240u_g1234c &
                pids[$i]=$!
        done
        sleep 2
@@ -3745,10 +3745,19 @@ test_160() {
        do_facet mds1 $LCTL get_param osp.$FSNAME-OST0000-osc-MDT0000.error_list
        echo inflight $rc
        for ((i = 1; i <= threads; i++)); do
-               wait ${pids[$i]}
+               kill -USR1 ${pids[$i]} && wait ${pids[$i]}
        done
 
        (( $rc <= 2 )) || error "destroying OST objects are blocked $rc"
+
+       #without group lock, wait and check if all objects are destroyed
+       sleep $((timeout * 3))
+       do_facet mds1 $LCTL get_param osp.$FSNAME-OST0000-osc-MDT0000.error_list
+       local errs=$(do_facet mds1 $LCTL get_param -n osp.$FSNAME-OST0000-osc-MDT0000.error_list | wc -l)
+       rc=$(do_facet mds1 $LCTL get_param -n osp.$FSNAME-OST0000-osc-MDT0000.destroys_in_flight)
+
+       (( $errs == 0 )) || error "error_list not empty"
+       (( $rc == 0 )) || error "$rc destroys in flight"
 }
 run_test 160 "MDT destroys are blocked by grouplocks"