Whamcloud - gitweb
LU-17385 tests: always_except sanity-lfsck/24 44/53544/8
author Andreas Dilger <adilger@whamcloud.com>
Sat, 23 Dec 2023 07:19:02 +0000 (00:19 -0700)
committer Oleg Drokin <green@whamcloud.com>
Wed, 27 Dec 2023 19:25:15 +0000 (19:25 +0000)
The sanity-lfsck test_24 and test_26a started failing recently after
the new test_23d landed.  Disable test_23d for now to avoid these test
failures, but do not remove it, so that it remains possible to continue
debugging it.  Add extra debugging to wait_all_targets_blocked() to see
why this is happening.

Test-Parameters: trivial testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4
Fixes: 07e02a600e ("LU-16826 tests: lfsck to repair a dangling remote entry")
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: Ib6edf1d014ceb6b5d965aadc11272a88e8c001d5
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53544
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/tests/sanity-lfsck.sh

index ff3280a..3e49078 100755 (executable)
@@ -15,6 +15,7 @@ init_logging
 
 # bug number for skipped test:
 ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT "
+always_except LU-17385 23d
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
@@ -134,12 +135,30 @@ wait_all_targets_blocked() {
        local com=$1
        local status=$2
        local err=$3
+       # wait to simulate blocked wait, so that we can know the status
+       local timeout=${4:-600}
+       local lfsck_query="$LCTL lfsck_query -t $com -M $FSNAME-MDT0000"
+
+       wait_update_facet --quiet mds1 \
+               "$lfsck_query | awk '/^${com}_mdts_$status/ { print \\\$2 }'" \
+               "$MDSCOUNT" $timeout || {
+               local mdts=$(comma_list $(mdts_nodes))
+               local count=$(do_facet mds1 "$lfsck_query" |
+                             awk '/^${com}_mdts_$status/ { print $2 }')
+
+               do_facet mds1 "$lfsck_query"
+               echo "==== MDT LOGS ===="
+               do_nodes $mdts "$LCTL get_param mdd.*.lfsck_$com"
+               do_nodes $mdts "$LCTL get_param osd*.*.oi_scrub"
+               if [[ "$com" == "layout" ]]; then
+                       local osts=$(comma_list $(osts_nodes))
+                       echo "==== OST LOGS ===="
+
+                       do_nodes $osts "$LCTL get_param obdfilter.*.lfsck_$com"
+                       do_nodes $osts "$LCTL get_param osd*.*.oi_scrub"
+               fi
+
 
-       local count=$(do_facet mds1 \
-                    "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w |
-                     awk '/^${com}_mdts_${status}/ { print \\\$2 }'")
-       [[ $count -eq $MDSCOUNT ]] || {
-               do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000"
                error "($err) only $count of $MDSCOUNT MDTs are in ${status}"
        }
 }
@@ -4096,6 +4115,7 @@ test_23c() {
        echo "#####"
 
        start_full_debug_logging
+       stack_trap stop_full_debug_logging
 
        check_mount_and_prep