lme_prefer:1;
/* mirror id */
__u16 lme_id;
+ /* preference */
+ __u16 lme_preference;
/* start,end index of this mirror in ldo_comp_entries */
__u16 lme_start;
__u16 lme_end;
/*
 * Fill lo->ldo_mirrors[] by walking lo->ldo_comp_entries[]: a component
 * whose mirror id matches the current one is folded into the current mirror
 * entry (stale/prefer flags OR-ed, preference summed, lme_end advanced);
 * otherwise a new mirror entry is started. The per-component preference is
 * the number of allocated stripes whose OST has OS_STATFS_NONROT set in
 * ltd_statfs.
 *
 * When the walk did not record an explicitly preferred mirror and more than
 * one mirror exists, the non-stale mirror with the highest preference gets
 * lme_prefer set.
 *
 * NOTE(review): this is a patch excerpt, some unchanged lines of the
 * function are not visible here.
 *
 * Returns -EINVAL when the last mirror index found differs from
 * lo->ldo_mirror_count - 1, 0 otherwise.
 */
int lod_fill_mirrors(struct lod_object *lo)
{
+ struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
struct lod_layout_component *lod_comp;
+ bool found_preferred = false;
int mirror_idx = -1;
__u16 mirror_id = 0xffff;
- int i;
+ int i, pref;
ENTRY;
LASSERT(equi(!lo->ldo_is_composite, lo->ldo_mirror_count == 0));
RETURN(0);
lod_comp = &lo->ldo_comp_entries[0];
+
for (i = 0; i < lo->ldo_comp_cnt; i++, lod_comp++) {
int stale = !!(lod_comp->llc_flags & LCME_FL_STALE);
int preferred = !!(lod_comp->llc_flags & LCME_FL_PREF_WR);
+ int j;
+
+ pref = 0;
+ /* calculate component preference over all used OSTs: one point
+  * for every allocated stripe whose OST reports OS_STATFS_NONROT.
+  * NOTE(review): the OST_TGT() result is used without a NULL check
+  * and llc_ost_indices[j] is not validated here -- confirm both
+  * are guaranteed at this point. */
+ for (j = 0; j < lod_comp->llc_stripes_allocated; j++) {
+ int idx = lod_comp->llc_ost_indices[j];
+ struct obd_statfs *osfs = &OST_TGT(lod,idx)->ltd_statfs;
+
+ if (osfs->os_state & OS_STATFS_NONROT)
+ pref++;
+ }
if (mirror_id_of(lod_comp->llc_id) == mirror_id) {
lo->ldo_mirrors[mirror_idx].lme_stale |= stale;
lo->ldo_mirrors[mirror_idx].lme_prefer |= preferred;
+ lo->ldo_mirrors[mirror_idx].lme_preference += pref;
lo->ldo_mirrors[mirror_idx].lme_end = i;
continue;
}
+ if (mirror_idx >= 0 && preferred &&
+ !lo->ldo_mirrors[mirror_idx].lme_stale)
+ found_preferred = true; /* NOTE(review): mirror_idx still names
+  * the previous mirror here (it is incremented just below), so the
+  * stale test reads the preceding mirror while `preferred` comes
+  * from the component opening the new one; with mirror_idx == -1
+  * the first mirror can never set this flag -- confirm intent */
+
/* new mirror */
++mirror_idx;
if (mirror_idx >= lo->ldo_mirror_count)
lo->ldo_mirrors[mirror_idx].lme_id = mirror_id;
lo->ldo_mirrors[mirror_idx].lme_stale = stale;
lo->ldo_mirrors[mirror_idx].lme_prefer = preferred;
+ lo->ldo_mirrors[mirror_idx].lme_preference = pref;
lo->ldo_mirrors[mirror_idx].lme_start = i;
lo->ldo_mirrors[mirror_idx].lme_end = i;
}
if (mirror_idx != lo->ldo_mirror_count - 1)
RETURN(-EINVAL);
+ if (!found_preferred && mirror_idx > 0) {
+ int best = -1;
+
+ /*
+ * if no explicitly preferred mirror was found, pick the
+ * non-stale mirror with the highest number of
+ * non-rotational OSTs; on a tie the lowest index wins
+ * because the comparison below is strict
+ */
+ pref = -1;
+ for (i = 0; i <= mirror_idx; i++) {
+ if (lo->ldo_mirrors[i].lme_stale)
+ continue;
+ if (lo->ldo_mirrors[i].lme_preference > pref) {
+ pref = lo->ldo_mirrors[i].lme_preference;
+ best = i;
+ }
+ }
+
+ LASSERT(best >= 0); /* best stays -1 only if every mirror is stale
+  * -- presumably impossible here; verify against callers */
+ lo->ldo_mirrors[best].lme_prefer = 1;
+ }
+
RETURN(0);
}
* cluster.
*/
lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
+
+ rc = lod_fill_mirrors(lo);
+ if (rc)
+ RETURN(rc);
+
for (i = 0; i < lo->ldo_mirror_count; i++) {
bool ost_avail = true;
int index = (i + seq) % lo->ldo_mirror_count;
struct lov_layout_entry {
__u32 lle_type;
unsigned int lle_valid:1;
+ unsigned int lle_preference;
struct lu_extent *lle_extent;
struct lov_stripe_md_entry *lle_lsme;
struct lov_comp_layout_entry_ops *lle_comp_ops;
struct lov_mirror_entry {
unsigned short lre_mirror_id;
- unsigned short lre_preferred:1,
- lre_stale:1, /* set if any components is stale */
+ unsigned short lre_stale:1, /* set if any components is stale */
lre_valid:1, /* set if at least one of components
* in this mirror is valid */
lre_foreign:1; /* set if it is a foreign component */
+ int lre_preference; /* overall preference of this mirror */
unsigned short lre_start; /* index to lo_entries, start index of
* this mirror */
struct cl_device *subdev;
struct lov_oinfo *oinfo = lse->lsme_oinfo[i];
int ost_idx = oinfo->loi_ost_idx;
+ struct obd_export *exp;
if (lov_oinfo_is_dummy(oinfo))
continue;
GOTO(out, result = -EIO);
}
+ exp = dev->ld_lov->lov_tgts[ost_idx]->ltd_exp;
+ if (likely(exp)) {
+ /* the more fast OSTs the better */
+ if (exp->exp_obd->obd_osfs.os_state & OS_STATFS_NONROT)
+ lle->lle_preference++;
+ }
+
subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
subconf->u.coc_oinfo = oinfo;
LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK;
int result = 0;
unsigned int seq;
- int i, j;
+ int i, j, preference;
bool dom_size = 0;
ENTRY;
lle->lle_lsme = lsm->lsm_entries[i];
lle->lle_type = lov_entry_type(lle->lle_lsme);
+ lle->lle_preference = 0;
switch (lle->lle_type) {
case LOV_PATTERN_RAID0:
lle->lle_comp_ops = &raid0_ops;
/* entries must be sorted by mirrors */
lre->lre_mirror_id = mirror_id;
lre->lre_start = lre->lre_end = i;
- lre->lre_preferred = !!(lle->lle_lsme->lsme_flags &
- LCME_FL_PREF_RD);
+ lre->lre_preference = lle->lle_lsme->lsme_flags &
+ LCME_FL_PREF_RD ? 1000 : 0;
lre->lre_valid = lle->lle_valid;
lre->lre_stale = !lle->lle_valid;
lre->lre_foreign = lsme_is_foreign(lle->lle_lsme);
/* decide the preferred mirror. It uses the hash value of lov_object
* so that different clients would use different mirrors for read. */
mirror_count = 0;
+ preference = -1;
seq = hash_long((unsigned long)lov, 8);
for (i = 0; i < comp->lo_mirror_count; i++) {
unsigned int idx = (i + seq) % comp->lo_mirror_count;
mirror_count++; /* valid mirror */
- if (lre->lre_preferred || comp->lo_preferred_mirror < 0)
+ /* aggregated preference of all involved OSTs */
+ for (j = lre->lre_start; j <= lre->lre_end; j++) {
+ lre->lre_preference +=
+ comp->lo_entries[j].lle_preference;
+ }
+
+ if (lre->lre_preference > preference) {
+ preference = lre->lre_preference;
comp->lo_preferred_mirror = idx;
+ }
}
if (!mirror_count) {
CDEBUG(D_INODE, DFID
}
run_test 207 "create another replica with existing out-of-sync one"
+function check_ost_used() {
+ local ddarg
+ local ost
+ local i
+ local file=$1
+ local io=$2
+
+ shift 2
+
+ cancel_lru_locks osc # to drop pages
+ cancel_lru_locks mdc # to refresh layout
+ # XXX: cancel_lru_locks mdc doesn't work
+ # XXX: need a better way to reload the layout
+ umount_client $MOUNT || error "umount failed"
+ mount_client $MOUNT || error "mount failed"
+
+ # refresh non-rotation status on MDTs
+ sleep 10
+ touch $DIR/$tfile-temp
+ rm -f $DIR/$tfile-temp
+ # refresh non-rotational status on the client
+ $LFS df >&/dev/null
+ sleep 2
+
+ $LCTL set_param osc.*.stats=clear >/dev/null
+ if [[ $io == "read" ]]; then
+ ddarg="if=$file of=/dev/null"
+ elif [[ $io == "write" ]]; then
+ ddarg="if=/dev/zero of=$file"
+ else
+ error "unknown type $io"
+ fi
+ dd $ddarg bs=2M count=1 || error "can't $io $file"
+ cancel_lru_locks osc
+
+ # check only specified OSTs got reads
+ for ((ost = 0; ost < $OSTCOUNT; ost++)); do
+ local nr=$($LCTL get_param -n \
+ osc.$FSNAME-OST000$ost-osc-[-0-9a-f]*.stats |
+ awk "/ost_$io/{print \$2}")
+ nr=${nr:-0}
+ if [[ " $* " =~ " $ost " ]]; then
+ (( nr > 0 )) || error "expected reads on $ost"
+ else
+ (( nr == 0 )) || error "unexpected $nr reads on $ost"
+ fi
+ done
+}
+
+# FLR read path: build three mirrors (OST0, OST1, OST2+OST3) and check
+# that reads go to the mirror with the most non-rotational OSTs, unless
+# a mirror carries the explicit "prefer" flag.
+test_208a() {
+	local tf=$DIR/$tfile
+	local osts=$(comma_list $(osts_nodes))
+	local old
+
+	(( $OSTCOUNT >= 4 )) || skip_env "needs >= 4 OSTs"
+
+	# remember the current nonrotational settings so they are restored
+	# when the test exits
+	old=$(do_nodes $osts \
+		$LCTL get_param osd*.*OST*.nonrotational | tr '\n' ' ')
+	stack_trap "do_nodes $osts $LCTL set_param $old"
+
+	stack_trap "rm -f $tf"
+	$LFS setstripe -i0 -c1 $tf || error "can't setstripe"
+	dd if=/dev/zero of=$tf bs=2M count=1 || error "can't dd (1)"
+	$LFS mirror extend -N -c1 -o1 $tf || error "can't create mirror"
+	$LFS mirror extend -N -c2 -o 2,3 $tf || error "can't create mirror"
+	$LFS mirror resync $tf || error "can't resync"
+	$LFS getstripe $tf
+
+	log "set OST0000 non-rotational"
+	do_nodes $osts \
+		$LCTL set_param osd*.*OST0000*.nonrotational=1
+	check_ost_used $tf read 0
+
+	log "set OST0002 and OST0003 non-rotational, two fast OSTs is better"
+	do_nodes $osts \
+		$LCTL set_param osd*.*OST0002*.nonrotational=1 \
+			osd*.*OST0003*.nonrotational=1
+	check_ost_used $tf read 2 3
+
+	log "set mirror 1 on OST0001 preferred"
+	$LFS setstripe --comp-set -I 0x20001 --comp-flags=prefer $tf ||
+		error "can't set prefer"
+	check_ost_used $tf read 1
+}
+run_test 208a "mirror selection to prefer non-rotational devices for reads"
+
+# FLR write path: build three mirrors (OST0, OST1, OST2+OST3) and check
+# that writes go to the mirror with the most non-rotational OSTs, unless
+# a mirror carries the explicit "prefer" flag. The file is resynced
+# after each write so that no mirror is left stale for the next step.
+test_208b() {
+	local tf=$DIR/$tfile
+	local osts=$(comma_list $(osts_nodes))
+	local old
+
+	(( $OSTCOUNT >= 4 )) || skip_env "needs >= 4 OSTs"
+
+	# remember the current nonrotational settings so they are restored
+	# when the test exits
+	old=$(do_nodes $osts \
+		$LCTL get_param osd*.*OST*.nonrotational | tr '\n' ' ')
+	stack_trap "do_nodes $osts $LCTL set_param $old"
+
+	stack_trap "rm -f $tf"
+	$LFS setstripe -i0 -c1 $tf || error "can't setstripe"
+	dd if=/dev/zero of=$tf bs=2M count=1 || error "can't dd (1)"
+	$LFS mirror extend -N -c1 -o1 $tf || error "can't create mirror"
+	$LFS mirror extend -N -c2 -o 2,3 $tf || error "can't create mirror"
+	$LFS mirror resync $tf || error "can't resync"
+	# quote the pattern so the shell cannot glob-expand it
+	$LFS getstripe $tf | grep -q "flags.*stale" && error "still stale"
+
+	log "set OST0000 non-rotational"
+	do_nodes $osts \
+		$LCTL set_param osd*.*OST0000*.nonrotational=1
+	check_ost_used $tf write 0
+	$LFS mirror resync $tf || error "can't resync"
+
+	log "set OST0002 and OST0003 non-rotational, two fast OSTs is better"
+	do_nodes $osts \
+		$LCTL set_param osd*.*OST0002*.nonrotational=1 \
+			osd*.*OST0003*.nonrotational=1
+	check_ost_used $tf write 2 3
+	$LFS mirror resync $tf || error "can't resync"
+
+	log "set mirror 1 on OST0001 preferred"
+	$LFS setstripe --comp-set -I 0x20001 --comp-flags=prefer $tf ||
+		error "can't set prefer"
+	check_ost_used $tf write 1
+}
+run_test 208b "mirror selection to prefer non-rotational devices for writes"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status