From: Bobi Jam Date: Tue, 26 Dec 2017 10:16:40 +0000 (+0800) Subject: LU-10448 lod: pick primary mirror for write X-Git-Tag: 2.10.59~120 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=6b373937774d0b3736d02b260be9a81c7eff5351;p=fs%2Flustre-release.git LU-10448 lod: pick primary mirror for write When a mirrored file is written for the first time, the MDS chooses a mirror to write the data to. This patch defines a primary-picking policy function (lod_primary_pick()) that avoids mirrors with unavailable OSTs. Signed-off-by: Bobi Jam Change-Id: I5d6d0459e96583294c3040a7994c33114be1e439 Reviewed-on: https://review.whamcloud.com/30711 Reviewed-by: Jinshan Xiong Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Fan Yong Reviewed-by: Oleg Drokin --- diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 9cab1e3..022f7a9 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -703,6 +703,7 @@ __u16 lod_comp_entry_stripe_count(struct lod_object *lo, bool is_dir); __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, __u16 stripe_count); +void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod); /* lproc_lod.c */ int lod_procfs_init(struct lod_device *lod); diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 9aff034..7a5697e 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -5574,6 +5574,149 @@ static void lod_stale_components(struct lod_object *lo, int primary, } } +/** + * check an OST's availability + * \param[in] env execution environment + * \param[in] lo lod object + * \param[in] dt dt object + * \param[in] index mirror index + * + * \retval negative if failed + * \retval 1 if \a dt is available + * \retval 0 if \a dt is not available + */ +static inline int lod_check_ost_avail(const struct lu_env *env, + struct lod_object *lo, + struct dt_object *dt, int index) +{ + struct lod_device *lod = 
lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + struct lod_tgt_desc *ost; + __u32 idx; + int type = LU_SEQ_RANGE_OST; + int rc; + + rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type); + if (rc < 0) { + CERROR("%s: can't locate "DFID":rc = %d\n", + lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)), + rc); + return rc; + } + + ost = OST_TGT(lod, idx); + if (ost->ltd_statfs.os_state & + (OS_STATE_READONLY | OS_STATE_ENOSPC | OS_STATE_ENOINO) || + ost->ltd_active == 0) { + CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n", + PFID(lod_object_fid(lo)), index, idx, rc); + return 0; + } + + return 1; +} + +/** + * Pick primary mirror for write + * \param[in] env execution environment + * \param[in] lo object + * \param[in] extent write range + */ +static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo, + struct lu_extent *extent) +{ + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + unsigned int seq = 0; + struct lod_layout_component *lod_comp; + int i, j, rc; + int picked = -1, second_pick = -1, third_pick = -1; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) { + get_random_bytes(&seq, sizeof(seq)); + seq %= lo->ldo_mirror_count; + } + + /** + * Pick a mirror as the primary, and check the availability of OSTs. + * + * This algo can be revised later after knowing the topology of + * cluster. 
+ */ + lod_qos_statfs_update(env, lod); + for (i = 0; i < lo->ldo_mirror_count; i++) { + bool ost_avail = true; + int index = (i + seq) % lo->ldo_mirror_count; + + if (lo->ldo_mirrors[index].lme_stale) { + CDEBUG(D_LAYOUT, DFID": mirror %d stale\n", + PFID(lod_object_fid(lo)), index); + continue; + } + + /* 2nd pick is for the primary mirror containing unavail OST */ + if (lo->ldo_mirrors[index].lme_primary && second_pick < 0) + second_pick = index; + + /* 3rd pick is for non-primary mirror containing unavail OST */ + if (second_pick < 0 && third_pick < 0) + third_pick = index; + + /** + * we found a non-primary 1st pick, we'd like to find a + * potential primary mirror. + */ + if (picked >= 0 && !lo->ldo_mirrors[index].lme_primary) + continue; + + /* check the availability of OSTs */ + lod_foreach_mirror_comp(lod_comp, lo, index) { + if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe) + continue; + + for (j = 0; j < lod_comp->llc_stripe_count; j++) { + struct dt_object *dt = lod_comp->llc_stripe[j]; + + rc = lod_check_ost_avail(env, lo, dt, index); + if (rc < 0) + RETURN(rc); + + ost_avail = !!rc; + if (!ost_avail) + break; + } /* for all dt objects in one component */ + if (!ost_avail) + break; + } /* for all components in a mirror */ + + /** + * skip this mirror unless all the OSTs holding the allocated + * objects of its components are available. + */ + if (!ost_avail) + continue; + + /* this mirror has all OSTs available */ + picked = index; + + /** + * a primary mirror with all OSTs available is the perfect + * 1st pick. 
+ */ + if (lo->ldo_mirrors[index].lme_primary) + break; + } /* for all mirrors */ + + /* failed to pick a sound mirror, lower our expectation */ + if (picked < 0) + picked = second_pick; + if (picked < 0) + picked = third_pick; + if (picked < 0) + RETURN(-ENODATA); + + RETURN(picked); +} + static int lod_declare_update_rdonly(const struct lu_env *env, struct lod_object *lo, struct md_layout_change *mlc, struct thandle *th) @@ -5583,9 +5726,7 @@ static int lod_declare_update_rdonly(const struct lu_env *env, struct lod_layout_component *lod_comp; struct layout_intent *layout = mlc->mlc_intent; struct lu_extent extent = layout->li_extent; - unsigned int seq = 0; int picked; - int i; int rc; ENTRY; @@ -5596,33 +5737,9 @@ static int lod_declare_update_rdonly(const struct lu_env *env, CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n", PFID(lod_object_fid(lo)), PEXT(&extent)); - if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) { - get_random_bytes(&seq, sizeof(seq)); - seq %= lo->ldo_mirror_count; - } - - /** - * Pick a mirror as the primary. - * Now it only picks the first mirror that has primary flag set and - * doesn't have any stale components. This algo should be revised - * later after knowing the topology of cluster or the availability of - * OSTs. 
- */ - for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) { - int index = (i + seq) % lo->ldo_mirror_count; - - if (!lo->ldo_mirrors[index].lme_stale) { - if (lo->ldo_mirrors[index].lme_primary) { - picked = index; - break; - } - - if (picked < 0) - picked = index; - } - } - if (picked < 0) /* failed to pick a primary */ - RETURN(-ENODATA); + picked = lod_primary_pick(env, lo, &extent); + if (picked < 0) + RETURN(picked); CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n", PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id); @@ -5654,9 +5771,6 @@ static int lod_declare_update_rdonly(const struct lu_env *env, if (lod_comp_inited(lod_comp)) continue; - CDEBUG(D_LAYOUT, "instantiate: %u / %u\n", - i, lod_comp_index(lo, lod_comp)); - info->lti_comp_idx[info->lti_count++] = lod_comp_index(lo, lod_comp); } diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 7101f54..a5b4717 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -259,8 +259,7 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, * \param[in] env execution environment for this thread * \param[in] lod LOD device */ -static void lod_qos_statfs_update(const struct lu_env *env, - struct lod_device *lod) +void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod) { struct obd_device *obd = lod2obd(lod); struct ost_pool *osts = &(lod->lod_pool_info); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 5eaf52c..4201887 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -1340,8 +1340,9 @@ test_40() { for ops in "conv=notrunc" ""; do rm -f $tf - $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E4m -E-1 \ - $tf || error "create PFLR file $tf failed" + $LFS mirror create -N -E2m -E4m -E-1 --flags=prefer \ + -N -E1m -E2m -E4m -E-1 $tf || + error "create PFLR file $tf failed" dd if=/dev/zero of=$tf $ops bs=1M seek=2 count=1 || error "write PFLR file $tf failed" @@ -1358,7 +1359,7 @@ 
test_40() { # the 2nd component (in mirror 1) should be inited verify_comp_attr lcme_flags $tf 0x10002 init # the 3rd component (in mirror 1) should be uninited - verify_comp_attr lcme_flags $tf 0x10003 0 + verify_comp_attr lcme_flags $tf 0x10003 prefer # the 4th component (in mirror 2) should be inited verify_comp_attr lcme_flags $tf 0x20004 init # the 5th component (in mirror 2) should be uninited @@ -1517,6 +1518,77 @@ test_42() { } run_test 42 "lfs mirror verify" +# inactivate one OST && write && restore the OST +write_file_43() { + local file=$1 + local ost=$2 + local PARAM="osc.${FSNAME}-OST000${ost}-osc-M*.active" + local wait + + wait=$(do_facet $SINGLEMDS \ + "$LCTL get_param -n lod.*MDT0000-*.qos_maxage") + wait=${wait%%[^0-9]*} + + echo "deactivate OST$ost, waiting for $((wait*2)) seconds" + $(do_facet $SINGLEMDS "$LCTL set_param -n $PARAM 0") + # lod_qos_statfs_update needs 2*$wait seconds to refresh targets statfs + sleep $(($wait * 2)) + echo "write $file" + dd if=/dev/zero of=$file bs=1M count=1 || error "write $file failed" + echo "restore activating OST$ost, waiting for $((wait*2)) seconds" + $(do_facet $SINGLEMDS "$LCTL set_param -n $PARAM 1") + sleep $((wait * 2)) + + local flags=$($LFS getstripe -v $file | awk '/lcm_flags:/ { print $2 }') + [ $flags = wp ] || error "file mirror state $flags != wp" +} + +test_43() { + [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return + + local tf=$DIR/$tfile + local flags + + rm -f $tf + ## mirror 0 ost (0, 1) + ## mirror 1 ost (1, 2) + ## mirror 2 ost (2, 0) + $LFS mirror create -N -Eeof -c2 -o0,1 -N -Eeof -c2 -o1,2 \ + -N -Eeof -c2 -o2,0 $tf || + error "create 3 mirrors file $tf failed" + + ################## OST0 ########################################### + write_file_43 $tf 0 + verify_comp_attr lcme_flags $tf 0x10001 init,stale + verify_comp_attr lcme_flags $tf 0x20002 init + verify_comp_attr lcme_flags $tf 0x30003 init,stale + + # resync + echo "resync $tf" + $LFS mirror resync $tf + flags=$($LFS 
getstripe -v $tf | awk '/lcm_flags:/ { print $2 }') + [ $flags = ro ] || error "file mirror state $flags != ro" + + ################## OST1 ########################################### + write_file_43 $tf 1 + verify_comp_attr lcme_flags $tf 0x10001 init,stale + verify_comp_attr lcme_flags $tf 0x20002 init,stale + verify_comp_attr lcme_flags $tf 0x30003 init + + # resync + echo "resync $tf" + $LFS mirror resync $tf + flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }') + [ $flags = ro ] || error "file mirror state $flags != ro" + + ################## OST2 ########################################### + write_file_43 $tf 2 + verify_comp_attr lcme_flags $tf 0x10001 init + verify_comp_attr lcme_flags $tf 0x20002 init,stale + verify_comp_attr lcme_flags $tf 0x30003 init,stale +} +run_test 43 "mirror pick on write" + test_44() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return rm -rf $DIR/$tdir