Whamcloud - gitweb
LU-18723 hsm: sanity-hsm 500 hung in llapi_hsm_copytool_recv 84/58084/5
author Sebastien Buisson <sbuisson@ddn.com>
Fri, 14 Feb 2025 09:16:56 +0000 (17:16 +0800)
committer Oleg Drokin <green@whamcloud.com>
Thu, 6 Mar 2025 08:06:37 +0000 (08:06 +0000)
sanity-hsm hung in test_500 in llapi_hsm_test test100.
The bug can be reproduced by the following test script:
ONLY="411 500" REFORMAT=yes ./sanity-hsm.sh

The reason is that the previous test case 411 does not clean up
properly: it fails to unregister the HSM agent because of the
permissions in effect under the active rbac role, so the unregister
request returns -EPERM:
mdt_hsm_ct_unregister() {
...
if (!mdt_hsm_is_admin(info))
GOTO(out, rc = -EPERM);
...

This bug can easily be solved by making sure nodemap is always removed
before the copytool is cleaned up.

Test-Parameters: trivial testlist=sanity-hsm
Signed-off-by: Sebastien Buisson <sbuisson@ddn.com>
Change-Id: I093775eeaf39b4d2671e3a05e41f33a9e1d8ec5e
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58084
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: Robert Read <rread@ddn.com>
Reviewed-by: Robert Read <rread@ddn.com>
Reviewed-by: Li Xi <lixi@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/sanity-hsm.sh

index 162a2ad..2bb5f23 100755 (executable)
@@ -5705,7 +5705,7 @@ run_test 410 "lfs data_version -s allows release of force-archived file"
 cleanup_411() {
        local nm=$1
 
-       do_facet mgs $LCTL nodemap_del $nm
+       do_facet mgs $LCTL nodemap_del $nm || true
        do_facet mgs $LCTL nodemap_activate 0
        wait_nm_sync active
 }
@@ -5763,6 +5763,16 @@ test_411()
        $LFS hsm_clear --exists $tf1 || error "hsm_clear $tf1 failed"
        check_hsm_flags $tf1 "0x00000000"
 
+       # check copytool cleanup works with the hsm_ops rbac role present
+       # and nodemap activated
+       kill_copytools
+       wait_copytools || error "copytool failed to stop"
+
+       copytool setup
+       # Re-add cleanup_411 to the stack to make sure it is always called
+       # before the copytool is cleaned up.
+       stack_trap "cleanup_411 $nm" EXIT
+
        # remove hsm_ops from rbac roles
        roles=$(echo "$roles" | sed 's/hsm_ops,//;s/,hsm_ops//;s/^hsm_ops,//')
        do_facet mgs $LCTL nodemap_modify --name $nm \