From: Thomas Bertschinger Date: Tue, 25 Jul 2023 16:03:47 +0000 (-0400) Subject: LU-16981 lod: update llc_stripe_count after ost inactive X-Git-Tag: 2.15.59~135 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=78336aa166f4a7a0128a5891c747eecf26ff9565;p=fs%2Flustre-release.git LU-16981 lod: update llc_stripe_count after ost inactive If an OST gets deactivated while lod_ost_alloc_qos() is trying to allocate stripes for a file create, then normally this is caught and EAGAIN is returned, which causes lod_comp->llc_stripe_count to get updated to accurately reflect the stripe count. But there is a race condition: if the OST is deactivated after the call to ltd_qos_is_usable() but before the stripes are allocated, then the stripe count is never updated. This causes an LBUG later in lod_striped_create() because fewer stripes are allocated than the number in llc_stripe_count, so it finds a stripe that is NULL. The solution is to properly update lod_comp->llc_stripe_count when the number of stripes created is less than expected. 
Fixes: ced540165ef5 ("LU-16623 lod: handle object allocation consistently") Test-Parameters: testlist=sanity env=ONLY=27V,ONLY_REPEAT=100 Signed-off-by: Thomas Bertschinger Change-Id: Ia1264f24904fed00454b3bc3c0d6c7b9b947737f Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/51759 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Sergey Cheremencev Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger --- diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 69190ba..2244cbb 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -1730,6 +1730,8 @@ static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, set_bit(LQ_DIRTY, &lod->lod_ost_descs.ltd_qos.lq_flags); clear_bit(LQ_SAME_SPACE, &lod->lod_ost_descs.ltd_qos.lq_flags); rc = -EAGAIN; + } else if (nfound < lod_comp->llc_stripe_count) { + lod_comp->llc_stripe_count = nfound; } /* If there are enough OSTs, a component with overstriping requessted diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 2623127..f3dc745 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3505,6 +3505,45 @@ test_27U() { } run_test 27U "append pool and stripe count work with composite default layout" +test_27V() { + [ $PARALLEL == "yes" ] && skip "skip parallel run" + (( $OSTCOUNT >= 4 )) || skip_env "needs >= 4 OSTs" + + local dir=$DIR/$tdir + local osp_param=osp.$FSNAME-OST0000-osc-MDT0000.max_create_count + local lod_param=lod.$FSNAME-MDT0000-mdtlov.qos_threshold_rr + local saved_max=$(do_facet mds1 $LCTL get_param -n $osp_param) + local saved_qos=$(do_facet mds1 $LCTL get_param -n $lod_param) + local pid + + stack_trap "do_facet mds1 $LCTL set_param $osp_param=$saved_max" + + do_facet mds1 $LCTL set_param $lod_param=0 + stack_trap "do_facet mds1 $LCTL set_param $lod_param=$saved_qos" + + $LFS setdirstripe --mdt-count=1 --mdt-index=0 $dir + stack_trap "rm -rf $dir" + + # exercise race in LU-16981 with deactivating OST while creating a file + ( + while 
true; do + do_facet mds1 $LCTL set_param $osp_param=0 > /dev/null + sleep 0.1 + do_facet mds1 \ + $LCTL set_param $osp_param=$saved_max > /dev/null + done + ) & + + pid=$! + stack_trap "kill -9 $pid" + + # errors here are OK so ignore them (just don't want to crash) + $LFS setstripe -c -1 $dir/f.{1..200} 2> /dev/null + + return 0 +} +run_test 27V "creating widely striped file races with deactivating OST" + # createtest also checks that device nodes are created and # then visible correctly (#2091) test_28() { # bug 2091