From 955e38051765609fe3a661035c0fab2cfca733ce Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sat, 23 Dec 2023 00:19:02 -0700 Subject: [PATCH] LU-17385 tests: always_except sanity-lfsck/24 sanity-lfsck test_24 and test_26a started failing recently, after the new test_23d landed. Disable test_23d for now to avoid these test failures, but do not remove it, so that debugging can continue. Add extra debugging to show why this is happening. Test-Parameters: trivial testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Test-Parameters: testlist=sanity-lfsck mdscount=2 mdtcount=4 Fixes: 07e02a600e ("LU-16826 tests: lfsck to repair a dangling remote entry") Signed-off-by: Andreas Dilger Change-Id: Ib6edf1d014ceb6b5d965aadc11272a88e8c001d5 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53544 Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- lustre/tests/sanity-lfsck.sh | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index ff3280a..3e49078 100755 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -15,6 +15,7 @@ init_logging # bug number for skipped test: ALWAYS_EXCEPT="$SANITY_LFSCK_EXCEPT " +always_except LU-17385 23d # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
[ "$SLOW" = "no" ] && EXCEPT_SLOW="" @@ -134,12 +135,30 @@ wait_all_targets_blocked() { local com=$1 local status=$2 local err=$3 + # poll instead of "lfsck_query -w" so state can be dumped on timeout + local timeout=${4:-600} + local lfsck_query="$LCTL lfsck_query -t $com -M $FSNAME-MDT0000" + + wait_update_facet --quiet mds1 \ + "$lfsck_query | awk '/^${com}_mdts_$status/ { print \\\$2 }'" \ + "$MDSCOUNT" $timeout || { + local mdts=$(comma_list $(mdts_nodes)) + local count=$(do_facet mds1 "$lfsck_query" | + awk "/^${com}_mdts_$status/ { print \$2 }") + + do_facet mds1 "$lfsck_query" + echo "==== MDT LOGS ====" + do_nodes $mdts "$LCTL get_param mdd.*.lfsck_$com" + do_nodes $mdts "$LCTL get_param osd*.*.oi_scrub" + if [[ "$com" == "layout" ]]; then + local osts=$(comma_list $(osts_nodes)) + echo "==== OST LOGS ====" + + do_nodes $osts "$LCTL get_param obdfilter.*.lfsck_$com" + do_nodes $osts "$LCTL get_param osd*.*.oi_scrub" + fi + - local count=$(do_facet mds1 \ - "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000 -w | - awk '/^${com}_mdts_${status}/ { print \\\$2 }'") - [[ $count -eq $MDSCOUNT ]] || { - do_facet mds1 "$LCTL lfsck_query -t $com -M ${FSNAME}-MDT0000" error "($err) only $count of $MDSCOUNT MDTs are in ${status}" } } @@ -4096,6 +4115,7 @@ test_23c() { echo "#####" start_full_debug_logging + stack_trap stop_full_debug_logging check_mount_and_prep -- 1.8.3.1