From: Bobi Jam Date: Fri, 24 May 2019 17:40:25 +0000 (+0800) Subject: LU-12328 flr: avoid reading unhealthy mirror X-Git-Tag: 2.12.90~29 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=39da3c06275e04e2a6e7f055cb27ee9dff1ea576;ds=sidebyside LU-12328 flr: avoid reading unhealthy mirror * Fix an error in lov_io_mirror_init() which would wait unnecessarily if we're retrying the last mirror of the file. * In osc_io_iter_init() we'd check its OSC import status so that the read path can quickly switch to another mirror. sanity-flr test_33b is added to test this case. * Once all mirrors have been tried, we'd turn off the quick switch so that when all mirrors contain bad OSTs, the read will still try its best to get partial data from a component before trying another mirror. sanity-flr test_33c is added to test this case. Test-Parameters: envdefinitions=ONLY="33" testlist=sanity-flr,sanity-flr,sanity-flr,sanity-flr,sanity-flr,sanity-flr,sanity-flr,sanity-flr,sanity-flr,sanity-flr Fixes: 5a6ceb664f07 ("LU-7236 ptlrpc: idle connections can disconnect") Signed-off-by: Bobi Jam Change-Id: I5621a834e58ee1bfccf6c407d2c68357b5c3eb3b Reviewed-on: https://review.whamcloud.com/34952 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Wang Shilong Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index b475955..7dc9b10 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -1915,7 +1915,13 @@ struct cl_io { /** * Set if IO is triggered by async workqueue readahead. */ - ci_async_readahead:1; + ci_async_readahead:1, + /** + * Set if we've tried all mirrors for this read IO, if it's not set, + * the read IO will check to-be-read OSCs' status, and make fast-switch + * another mirror if some of the OSTs are not healthy. + */ + ci_tried_all_mirrors:1; /** * How many times the read has retried before this one. * Set by the top level and consumed by the LOV. 
diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c index 464c8f2..92b8ce5 100644 --- a/lustre/lov/lov_io.c +++ b/lustre/lov/lov_io.c @@ -142,6 +142,7 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, sub_io->ci_lock_no_expand = io->ci_lock_no_expand; sub_io->ci_ndelay = io->ci_ndelay; sub_io->ci_layout_version = io->ci_layout_version; + sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors; result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); @@ -405,13 +406,13 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, found = true; break; } - } - + } /* each component of the mirror */ if (found) { index = (index + i) % comp->lo_mirror_count; break; } - } + } /* each mirror */ + if (i == comp->lo_mirror_count) { CERROR(DFID": failed to find a component covering " "I/O region at %llu\n", @@ -435,16 +436,22 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, * of this client has been partitioned. We should relinquish CPU for * a while before trying again. */ - ++io->ci_ndelay_tried; - if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) { + if (io->ci_ndelay && io->ci_ndelay_tried > 0 && + (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) { set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1)); /* 10ms */ + schedule_timeout(cfs_time_seconds(1) / 100); /* 10ms */ if (signal_pending(current)) RETURN(-EINTR); - /* reset retry counter */ - io->ci_ndelay_tried = 1; + /** + * we'd set ci_tried_all_mirrors to turn off fast mirror + * switching for read after we've tried all mirrors several + * rounds. + */ + io->ci_tried_all_mirrors = io->ci_ndelay_tried % + (comp->lo_mirror_count * 4) == 0; } + ++io->ci_ndelay_tried; CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n", io->ci_ndelay ? 
"non-" : ""); @@ -682,6 +689,7 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio, case CIT_READ: case CIT_WRITE: { io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors; if (cl_io_is_append(parent)) { io->u.ci_wr.wr_append = 1; } else { diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c index b4a196f..03c0e50 100644 --- a/lustre/osc/osc_io.c +++ b/lustre/osc/osc_io.c @@ -368,15 +368,30 @@ int osc_io_commit_async(const struct lu_env *env, } EXPORT_SYMBOL(osc_io_commit_async); +static bool osc_import_not_healthy(struct obd_import *imp) +{ + return imp->imp_invalid || imp->imp_deactive || + !(imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_IDLE); +} + int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) { struct osc_object *osc = cl2osc(ios->cis_obj); struct obd_import *imp = osc_cli(osc)->cl_import; struct osc_io *oio = osc_env_io(env); int rc = -EIO; + ENTRY; spin_lock(&imp->imp_lock); - if (likely(!imp->imp_invalid)) { + /** + * check whether this OSC device is available for non-delay read, + * fast switching mirror if we haven't tried all mirrors. 
+ */ + if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay && + !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) { + rc = -EWOULDBLOCK; + } else if (likely(!imp->imp_invalid)) { atomic_inc(&osc->oo_nr_ios); oio->oi_is_active = 1; rc = 0; @@ -386,7 +401,7 @@ int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios) if (cfs_capable(CFS_CAP_SYS_RESOURCE)) oio->oi_cap_sys_resource = 1; - return rc; + RETURN(rc); } EXPORT_SYMBOL(osc_io_iter_init); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 37c439e..3df762b 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -273,6 +273,16 @@ verify_comp_attrs() { verify_comp_attr_with_parent pool $tf $comp_id } +verify_flr_state() +{ + local tf=$1 + local expected_state=$2 + + local state=$($LFS getstripe -v $tf | awk '/lcm_flags/{ print $2 }') + [ $expected_state = $state ] || + error "expected: $expected_state, actual $state" +} + # command line test cases test_0a() { local td=$DIR/$tdir @@ -1040,7 +1050,7 @@ test_32() { } run_test 32 "data should be mirrored to newly created mirror" -test_33() { +test_33a() { [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return rm -f $DIR/$tfile $DIR/$tfile-2 @@ -1114,7 +1124,114 @@ test_33() { start_osts 2 } -run_test 33 "read can choose available mirror to read" +run_test 33a "read can choose available mirror to read" + +test_33b() { + [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return + + rm -f $DIR/$tfile + + stack_trap "rm -f $DIR/$tfile" EXIT + + # create a file with two mirrors on OST0000 and OST0001 + $LFS setstripe -N -Eeof -o0 -N -Eeof -o1 $DIR/$tfile + + # make sure that $tfile has two mirrors + [ $($LFS getstripe -N $DIR/$tfile) -eq 2 ] || + { $LFS getstripe $DIR/$tfile; error "expected count 2"; } + + # write 50M + dd if=/dev/urandom of=$DIR/$tfile bs=2M count=25 || + error "write failed for $DIR/$tfile" + $LFS mirror resync $DIR/$tfile || error "resync failed for 
$DIR/$tfile" + verify_flr_state $DIR/$tfile "ro" + drop_client_cache + + ls -l $DIR/$tfile + + # read file - all OSTs are available + echo "reading file (data can be provided by any ost)... " + local t1=$SECONDS + time cat $DIR/$tfile > /dev/null || error "read all" + local t2=$SECONDS + ra=$((t2 - t1)) + + # read file again with ost1 {OST0000} failed + stop_osts 1 + drop_client_cache + echo "reading file (data should be provided by ost2)..." + t1=$SECONDS + time cat $DIR/$tfile > /dev/null || error "read ost2" + t2=$SECONDS + r1=$((t2 - t1)) + + # remount ost1 + start_osts 1 + + # read file again with ost2 {OST0001} failed + stop_osts 2 + drop_client_cache + + echo "reading file (data should be provided by ost1)..." + t1=$SECONDS + time cat $DIR/$tfile > /dev/null || error "read ost1" + t2=$SECONDS + r2=$((t2 - t1)) + + # remount ost2 + start_osts 2 + + [ $((r1 * 100)) -gt $((ra * 105)) -a $r1 -gt $((ra + 2)) ] && + error "read mirror too slow without ost1, from $ra to $r1" + [ $((r2 * 100)) -gt $((ra * 105)) -a $r2 -gt $((ra + 2)) ] && + error "read mirror too slow without ost2, from $ra to $r2" + + wait_osc_import_ready client ost2 +} +run_test 33b "avoid reading from unhealthy mirror" + +test_33c() { + [[ $OSTCOUNT -lt 3 ]] && skip "need >= 3 OSTs" && return + + rm -f $DIR/$tfile + + stack_trap "rm -f $DIR/$tfile" EXIT + + # create a file with two mirrors + # mirror1: {OST0000, OST0001} + # mirror2: {OST0001, OST0002} + $LFS setstripe -N -Eeof -c2 -o0,1 -N -Eeof -c2 -o1,2 $DIR/$tfile + + # make sure that $tfile has two mirrors + [ $($LFS getstripe -N $DIR/$tfile) -eq 2 ] || + { $LFS getstripe $DIR/$tfile; error "expected count 2"; } + + # write 50M + dd if=/dev/urandom of=$DIR/$tfile bs=2M count=25 || + error "write failed for $DIR/$tfile" + $LFS mirror resync $DIR/$tfile || error "resync failed for $DIR/$tfile" + verify_flr_state $DIR/$tfile "ro" + drop_client_cache + + ls -l $DIR/$tfile + + # read file - all OSTs are available + echo "reading file (data can 
be provided by any ost)... " + time cat $DIR/$tfile > /dev/null || error "read all" + + # read file again with ost2 (OST0001) failed + stop_osts 2 + drop_client_cache + + echo "reading file (data should be provided by ost1 and ost3)..." + time cat $DIR/$tfile > /dev/null || error "read ost1 & ost3" + + # remount ost2 + start_osts 2 + + wait_osc_import_ready client ost2 +} +run_test 33c "keep reading among unhealthy mirrors" test_34a() { [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return @@ -1376,16 +1493,6 @@ test_37() } run_test 37 "mirror I/O API verification" -verify_flr_state() -{ - local tf=$1 - local expected_state=$2 - - local state=$($LFS getstripe -v $tf | awk '/lcm_flags/{ print $2 }') - [ $expected_state = $state ] || - error "expected: $expected_state, actual $state" -} - test_38() { local tf=$DIR/$tfile local ref=$DIR/${tfile}-ref