From 8507472dd37ebc07bf7eb1b772c2ff619009c233 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Thu, 9 Sep 2021 11:16:41 +0300 Subject: [PATCH] LU-14996 lov: prefer mirrors on non-rotational OSTs Consider non-rotational OSTs as preferred unless an explicit prefer flag is set on a mirror. Signed-off-by: Alex Zhuravlev Change-Id: I787bcba0b5e45842c9d4762c7f97a8f44a4fc9cb Reviewed-on: https://review.whamcloud.com/44883 Tested-by: jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/lod/lod_internal.h | 2 + lustre/lod/lod_lov.c | 43 ++++++++++++++- lustre/lod/lod_object.c | 5 ++ lustre/lov/lov_cl_internal.h | 5 +- lustre/lov/lov_object.c | 26 +++++++-- lustre/tests/sanity-flr.sh | 123 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 197 insertions(+), 7 deletions(-) diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index bd050bd..0e41743 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -224,6 +224,8 @@ struct lod_mirror_entry { lme_prefer:1; /* mirror id */ __u16 lme_id; + /* preference */ + __u16 lme_preference; /* start,end index of this mirror in ldo_comp_entries */ __u16 lme_start; __u16 lme_end; diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 10ab3fa..ae80072 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -589,10 +589,12 @@ int lod_alloc_comp_entries(struct lod_object *lo, int lod_fill_mirrors(struct lod_object *lo) { + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); struct lod_layout_component *lod_comp; + bool found_preferred = false; int mirror_idx = -1; __u16 mirror_id = 0xffff; - int i; + int i, pref; ENTRY; LASSERT(equi(!lo->ldo_is_composite, lo->ldo_mirror_count == 0)); @@ -601,17 +603,34 @@ int lod_fill_mirrors(struct lod_object *lo) RETURN(0); lod_comp = &lo->ldo_comp_entries[0]; + for (i = 0; i < lo->ldo_comp_cnt; i++, lod_comp++) { int stale = !!(lod_comp->llc_flags & LCME_FL_STALE); int 
preferred = !!(lod_comp->llc_flags & LCME_FL_PREF_WR); + int j; + + pref = 0; + /* calculate component preference over all used OSTs */ + for (j = 0; j < lod_comp->llc_stripes_allocated; j++) { + int idx = lod_comp->llc_ost_indices[j]; + struct obd_statfs *osfs = &OST_TGT(lod,idx)->ltd_statfs; + + if (osfs->os_state & OS_STATFS_NONROT) + pref++; + } if (mirror_id_of(lod_comp->llc_id) == mirror_id) { lo->ldo_mirrors[mirror_idx].lme_stale |= stale; lo->ldo_mirrors[mirror_idx].lme_prefer |= preferred; + lo->ldo_mirrors[mirror_idx].lme_preference += pref; lo->ldo_mirrors[mirror_idx].lme_end = i; continue; } + if (mirror_idx >= 0 && preferred && + !lo->ldo_mirrors[mirror_idx].lme_stale) + found_preferred = true; + /* new mirror */ ++mirror_idx; if (mirror_idx >= lo->ldo_mirror_count) @@ -622,12 +641,34 @@ int lod_fill_mirrors(struct lod_object *lo) lo->ldo_mirrors[mirror_idx].lme_id = mirror_id; lo->ldo_mirrors[mirror_idx].lme_stale = stale; lo->ldo_mirrors[mirror_idx].lme_prefer = preferred; + lo->ldo_mirrors[mirror_idx].lme_preference = pref; lo->ldo_mirrors[mirror_idx].lme_start = i; lo->ldo_mirrors[mirror_idx].lme_end = i; } if (mirror_idx != lo->ldo_mirror_count - 1) RETURN(-EINVAL); + if (!found_preferred && mirror_idx > 0) { + int best = -1; + + /* + * if no explicitly preferred mirror is found, then find a mirror + * with the highest number of non-rotational OSTs + * */ + pref = -1; + for (i = 0; i <= mirror_idx; i++) { + if (lo->ldo_mirrors[i].lme_stale) + continue; + if (lo->ldo_mirrors[i].lme_preference > pref) { + pref = lo->ldo_mirrors[i].lme_preference; + best = i; + } + } + + LASSERT(best >= 0); + lo->ldo_mirrors[best].lme_prefer = 1; + } + RETURN(0); } diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index ea57141..5729d2f 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -7448,6 +7448,11 @@ static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo, * cluster. 
*/ lod_qos_statfs_update(env, lod, &lod->lod_ost_descs); + + rc = lod_fill_mirrors(lo); + if (rc) + RETURN(rc); + for (i = 0; i < lo->ldo_mirror_count; i++) { bool ost_avail = true; int index = (i + seq) % lo->ldo_mirror_count; diff --git a/lustre/lov/lov_cl_internal.h b/lustre/lov/lov_cl_internal.h index 284b4a4..40a5836 100644 --- a/lustre/lov/lov_cl_internal.h +++ b/lustre/lov/lov_cl_internal.h @@ -225,6 +225,7 @@ struct lov_layout_dom { struct lov_layout_entry { __u32 lle_type; unsigned int lle_valid:1; + unsigned int lle_preference; struct lu_extent *lle_extent; struct lov_stripe_md_entry *lle_lsme; struct lov_comp_layout_entry_ops *lle_comp_ops; @@ -236,11 +237,11 @@ struct lov_layout_entry { struct lov_mirror_entry { unsigned short lre_mirror_id; - unsigned short lre_preferred:1, - lre_stale:1, /* set if any components is stale */ + unsigned short lre_stale:1, /* set if any components is stale */ lre_valid:1, /* set if at least one of components * in this mirror is valid */ lre_foreign:1; /* set if it is a foreign component */ + int lre_preference; /* overall preference of this mirror */ unsigned short lre_start; /* index to lo_entries, start index of * this mirror */ diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c index 3da2c49..f8a7522 100644 --- a/lustre/lov/lov_object.c +++ b/lustre/lov/lov_object.c @@ -232,6 +232,7 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, struct cl_device *subdev; struct lov_oinfo *oinfo = lse->lsme_oinfo[i]; int ost_idx = oinfo->loi_ost_idx; + struct obd_export *exp; if (lov_oinfo_is_dummy(oinfo)) continue; @@ -246,6 +247,13 @@ static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, GOTO(out, result = -EIO); } + exp = dev->ld_lov->lov_tgts[ost_idx]->ltd_exp; + if (likely(exp)) { + /* the more fast OSTs the better */ + if (exp->exp_obd->obd_osfs.os_state & OS_STATFS_NONROT) + lle->lle_preference++; + } + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); 
subconf->u.coc_oinfo = oinfo; LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); @@ -624,7 +632,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK; int result = 0; unsigned int seq; - int i, j; + int i, j, preference; bool dom_size = 0; ENTRY; @@ -663,6 +671,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, lle->lle_lsme = lsm->lsm_entries[i]; lle->lle_type = lov_entry_type(lle->lle_lsme); + lle->lle_preference = 0; switch (lle->lle_type) { case LOV_PATTERN_RAID0: lle->lle_comp_ops = &raid0_ops; @@ -723,8 +732,8 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, /* entries must be sorted by mirrors */ lre->lre_mirror_id = mirror_id; lre->lre_start = lre->lre_end = i; - lre->lre_preferred = !!(lle->lle_lsme->lsme_flags & - LCME_FL_PREF_RD); + lre->lre_preference = lle->lle_lsme->lsme_flags & + LCME_FL_PREF_RD ? 1000 : 0; lre->lre_valid = lle->lle_valid; lre->lre_stale = !lle->lle_valid; lre->lre_foreign = lsme_is_foreign(lle->lle_lsme); @@ -769,6 +778,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, /* decide the preferred mirror. It uses the hash value of lov_object * so that different clients would use different mirrors for read. 
*/ mirror_count = 0; + preference = -1; seq = hash_long((unsigned long)lov, 8); for (i = 0; i < comp->lo_mirror_count; i++) { unsigned int idx = (i + seq) % comp->lo_mirror_count; @@ -782,8 +792,16 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev, mirror_count++; /* valid mirror */ - if (lre->lre_preferred || comp->lo_preferred_mirror < 0) + /* aggregated preference of all involved OSTs */ + for (j = lre->lre_start; j <= lre->lre_end; j++) { + lre->lre_preference += + comp->lo_entries[j].lle_preference; + } + + if (lre->lre_preference > preference) { + preference = lre->lre_preference; comp->lo_preferred_mirror = idx; + } } if (!mirror_count) { CDEBUG(D_INODE, DFID diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index a6dc15d..d9f677e 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -3699,6 +3699,129 @@ test_207() { } run_test 207 "create another replica with existing out-of-sync one" +function check_ost_used() { + local ddarg + local ost + local i + local file=$1 + local io=$2 + + shift 2 + + cancel_lru_locks osc # to drop pages + cancel_lru_locks mdc # to refresh layout + # XXX: cancel_lru_locks mdc doesn't work + # XXX: need a better way to reload the layout + umount_client $MOUNT || error "umount failed" + mount_client $MOUNT || error "mount failed" + + # refresh non-rotation status on MDTs + sleep 10 + touch $DIR/$tfile-temp + rm -f $DIR/$tfile-temp + # refresh non-rotational status on the client + $LFS df >&/dev/null + sleep 2 + + $LCTL set_param osc.*.stats=clear >/dev/null + if [[ $io == "read" ]]; then + ddarg="if=$file of=/dev/null" + elif [[ $io == "write" ]]; then + ddarg="if=/dev/zero of=$file" + else + error "unknown type $io" + fi + dd $ddarg bs=2M count=1 || error "can't $io $file" + cancel_lru_locks osc + + # check only specified OSTs got reads + for ((ost = 0; ost < $OSTCOUNT; ost++)); do + local nr=$($LCTL get_param -n \ + osc.$FSNAME-OST000$ost-osc-[-0-9a-f]*.stats 
| + awk "/ost_$io/{print \$2}") + nr=${nr:-0} + if [[ " $* " =~ " $ost " ]]; then + (( nr > 0 )) || error "expected reads on $ost" + else + (( nr == 0 )) || error "unexpected $nr reads on $ost" + fi + done +} + +test_208a() { + local tf=$DIR/$tfile + local osts=$(comma_list $(osts_nodes)) + + (( $OSTCOUNT >= 4 )) || skip_env "needs >= 4 OSTs" + + old=$(do_nodes $(comma_list $(osts_nodes)) \ + $LCTL get_param osd*.*OST*.nonrotational | tr '\n' ' ') + stack_trap "do_nodes $osts $LCTL set_param $old" + + stack_trap "rm -f $tf" + $LFS setstripe -i0 -c1 $tf || error "can't setstripe" + dd if=/dev/zero of=$tf bs=2M count=1 || error "can't dd (1)" + $LFS mirror extend -N -c1 -o1 $tf || error "can't create mirror" + $LFS mirror extend -N -c2 -o 2,3 $tf || error "can't create mirror" + $LFS mirror resync $tf || error "can't resync" + $LFS getstripe $tf + + log "set OST0000 non-rotational" + do_nodes $(comma_list $(osts_nodes)) \ + $LCTL set_param osd*.*OST0000*.nonrotational=1 + check_ost_used $tf read 0 + + log "set OST0002 and OST0003 non-rotational, two fast OSTs is better" + do_nodes $(comma_list $(osts_nodes)) \ + $LCTL set_param osd*.*OST0002*.nonrotational=1 \ + osd*.*OST0003*.nonrotational=1 + check_ost_used $tf read 2 3 + + log "set mirror 1 on OST0001 preferred" + $LFS setstripe --comp-set -I 0x20001 --comp-flags=prefer $tf || + error "can't set prefer" + check_ost_used $tf read 1 +} +run_test 208a "mirror selection to prefer non-rotational devices for reads" + +test_208b() { + local tf=$DIR/$tfile + local osts=$(comma_list $(osts_nodes)) + + (( $OSTCOUNT >= 4 )) || skip_env "needs >= 4 OSTs" + + old=$(do_nodes $(comma_list $(osts_nodes)) \ + $LCTL get_param osd*.*OST*.nonrotational | tr '\n' ' ') + stack_trap "do_nodes $osts $LCTL set_param $old" + + stack_trap "rm -f $tf" + $LFS setstripe -i0 -c1 $tf || error "can't setstripe" + dd if=/dev/zero of=$tf bs=2M count=1 || error "can't dd (1)" + $LFS mirror extend -N -c1 -o1 $tf || error "can't create mirror" + $LFS 
mirror extend -N -c2 -o 2,3 $tf || error "can't create mirror" + $LFS mirror resync $tf || error "can't resync" + $LFS getstripe $tf | grep -q flags.*stale && error "still stale" + + log "set OST0000 non-rotational" + do_nodes $(comma_list $(osts_nodes)) \ + $LCTL set_param osd*.*OST0000*.nonrotational=1 + check_ost_used $tf write 0 + $LFS mirror resync $tf || error "can't resync" + + log "set OST0002 and OST0003 non-rotational, two fast OSTs is better" + do_nodes $(comma_list $(osts_nodes)) \ + $LCTL set_param osd*.*OST0002*.nonrotational=1 \ + osd*.*OST0003*.nonrotational=1 + check_ost_used $tf write 2 3 + $LFS mirror resync $tf || error "can't resync" + + log "set mirror 1 on OST0001 preferred" + $LFS setstripe --comp-set -I 0x20001 --comp-flags=prefer $tf || + error "can't set prefer" + check_ost_used $tf write 1 +} +run_test 208b "mirror selection to prefer non-rotational devices for writes" + complete $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1