From 52057d85eaef8c7b5262f0718629fabff919ff1d Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Thu, 23 Jun 2022 09:33:47 -0400 Subject: [PATCH] LU-15393 tests: check QoS hang with OST failover Add recovery-small test 152 to reproduce the situation where an MDT object allocation sleeps on OST failover in lod_ost_alloc_rr while holding lq_rw_sem for read, and all other creation threads hang in lod_ost_alloc_qos at down_write(lq_rw_sem). HPE-bug-id: LUS-10388 Test-Parameters: trivial testlist=recovery-small env=ONLY=152 Signed-off-by: Alexander Boyko Change-Id: I7b9c5a5c9870a559e673a5fd253dcaea40d9fe63 Reviewed-on: https://review.whamcloud.com/47715 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andriy Skulysh Reviewed-by: Vitaly Fertman Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/lod/lod_qos.c | 6 ++++ lustre/tests/recovery-small.sh | 65 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 4da4e56..1da52a7 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -254,6 +254,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 #define OBD_FAIL_MDS_NO_LL_OPEN 0x171 #define OBD_FAIL_MDS_LL_BLOCK 0x172 +#define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173 /* CMD */ #define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 7f9b156..54e2e08 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -830,6 +830,12 @@ repeat_find: if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0) continue; + if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_LOD_CREATE_PAUSE)) { + clear_bit(LQ_SAME_SPACE, + &m->lod_ost_descs.ltd_qos.lq_flags); + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOD_CREATE_PAUSE, + cfs_fail_val); + } rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx, speed, &stripe_idx, stripe, ost_indices, th, &overstriped, diff --git a/lustre/tests/recovery-small.sh 
b/lustre/tests/recovery-small.sh
index 8a0fefd..438b854 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -3302,6 +3302,80 @@ test_150() {
 }
 run_test 150 "statfs when MDT0 offline with lazystatfs option"
 
+# Check that QoS object allocation is not stuck behind an RR allocation
+# that sleeps under lq_rw_sem (simulated OST failover): creates started
+# after the pause is armed must finish before the pause itself expires.
+test_152() {
+	[[ $($LCTL get_param mdc.*.import) =~ connect_flags.*overstriping ]] ||
+		skip "server does not support overstriping"
+
+	local before
+	local after
+	local diff
+	local saved
+	local version
+	local i
+	local pid
+	local pids_rr=""
+	local pids_qos=""
+	local setcount=500
+	local pause=20
+
+	large_xattr_enabled || skip_env "ea_inode feature disabled"
+	version=$(do_facet mds1 \
+		uname -r | sed -e "s/\([0-9]*\.[0-9]*\.[0-9]*\).*/\1/")
+	version=$(version_code ${version//\./ })
+	if (( $version < $(version_code 4.6.0) )); then
+		skip "MDS Linux kernel does not support killable semaphore"
+	fi
+
+	test_mkdir -i 0 -c 1 -p $DIR/$tdir
+	test_mkdir -i 0 -c 1 -p $DIR/$tdir/rr
+	test_mkdir -i 0 -c 1 -p $DIR/$tdir/qos
+	stack_trap "rm -rf $DIR/$tdir" EXIT
+
+	$LFS setstripe -C $setcount $DIR/$tdir/rr/ || error "setstripe failed"
+
+	#define OBD_FAIL_MDS_LOD_CREATE_PAUSE 0x173
+	# Simulate OST failover and sleep RR allocation under lq_rw_sem;
+	# fail_val is the pause handed to OBD_FAIL_TIMEOUT on the MDS
+	do_facet mds1 $LCTL set_param fail_loc=0x80000173 fail_val=$pause
+	before=$(date +%s)
+	for (( i = 0; i < 2; i++)); do
+		# braces are required: "$tfile_" would name another variable
+		touch $DIR/$tdir/rr/${tfile}_$i &
+		pids_rr="$pids_rr $!"
+	done
+	# presumably lets the RR creates reach the paused allocation first
+	sleep 3
+
+	saved=$(do_facet mds1 $LCTL get_param -n lov.*0000*.qos_threshold_rr)
+	do_facet mds1 $LCTL set_param lov.*.qos_threshold_rr=0
+	stack_trap "do_facet mds1 $LCTL set_param lov.*.qos_threshold_rr=$saved" EXIT
+
+	# create files with QoS algo, killable semaphore sleeps for 2 seconds
+	for (( i = 0; i < 3; i++)); do
+		touch $DIR/$tdir/qos/${tfile}_$i &
+		pids_qos="$pids_qos $!"
+	done
+
+	for pid in $pids_qos; do
+		wait $pid
+	done
+	after=$(date +%s)
+
+	diff=$((after - before))
+	echo "QoS allocation took $diff seconds"
+	# reap the RR creates too so no background job outlives the test
+	for pid in $pids_rr; do
+		wait $pid
+	done
+
+	(( $diff < $pause )) ||
+		error "QoS allocation slower than RR, killable semaphore doesn't work"
+}
+run_test 152 "QoS object allocation could be awakened in case of OST failover"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
-- 
1.8.3.1