}
}
+/**
+ * Check the availability of the OST that holds a stripe object.
+ *
+ * The OST index is resolved from the FID of \a dt by lod_fld_lookup(). The
+ * target is reported as unavailable when its descriptor is missing, when it
+ * is not active, or when its statfs state has any of the READONLY, ENOSPC or
+ * ENOINO bits set.
+ *
+ * \param[in] env	execution environment
+ * \param[in] lo	lod object
+ * \param[in] dt	dt object
+ * \param[in] index	mirror index, only used in the debug message
+ *
+ * \retval negative	if the FLD lookup failed
+ * \retval 1		if the OST of \a dt is available
+ * \retval 0		if the OST of \a dt is not available
+ */
+static inline int lod_check_ost_avail(const struct lu_env *env,
+				      struct lod_object *lo,
+				      struct dt_object *dt, int index)
+{
+	struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+	struct lod_tgt_desc *ost;
+	__u32 idx;
+	int type = LU_SEQ_RANGE_OST;
+	int rc;
+
+	rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
+	if (rc < 0) {
+		CERROR("%s: can't locate "DFID":rc = %d\n",
+		       lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
+		       rc);
+		return rc;
+	}
+
+	ost = OST_TGT(lod, idx);
+	/*
+	 * NOTE(review): assumes OST_TGT() can yield NULL for a slot without a
+	 * configured target; such an OST is reported as unavailable rather
+	 * than being dereferenced. Confirm against the OST_TGT() definition.
+	 */
+	if (!ost || ost->ltd_active == 0 ||
+	    (ost->ltd_statfs.os_state &
+	     (OS_STATE_READONLY | OS_STATE_ENOSPC | OS_STATE_ENOINO))) {
+		/* idx is unsigned; rc carries no information at this point */
+		CDEBUG(D_LAYOUT, DFID ": mirror %d OST%u unavail\n",
+		       PFID(lod_object_fid(lo)), index, idx);
+		return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * Check the OSTs of all allocated stripe objects of one mirror.
+ *
+ * Components that are not instantiated, or that have no stripe array, are
+ * skipped. The scan stops at the first OST that is unavailable or that
+ * cannot be located.
+ *
+ * \param[in] env	execution environment
+ * \param[in] lo	object
+ * \param[in] index	index of the mirror in lo->ldo_mirrors
+ *
+ * \retval negative	error returned by lod_check_ost_avail()
+ * \retval 1		if no unavailable OST was found
+ * \retval 0		if at least one OST is not available
+ */
+static int lod_mirror_osts_avail(const struct lu_env *env,
+				 struct lod_object *lo, int index)
+{
+	struct lod_layout_component *lod_comp;
+	int j, rc;
+
+	lod_foreach_mirror_comp(lod_comp, lo, index) {
+		if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
+			continue;
+
+		for (j = 0; j < lod_comp->llc_stripe_count; j++) {
+			rc = lod_check_ost_avail(env, lo,
+						 lod_comp->llc_stripe[j],
+						 index);
+			/* an error or an unavailable OST ends the scan */
+			if (rc <= 0)
+				return rc;
+		} /* for all dt object in one component */
+	} /* for all components in a mirror */
+
+	return 1;
+}
+
+/**
+ * Pick primary mirror for write
+ *
+ * Stale mirrors are never picked. Among the others the preference is:
+ *  1. a primary mirror with all OSTs available;
+ *  2. the first non-primary mirror with all OSTs available;
+ *  3. the first primary mirror, even if it contains an unavailable OST;
+ *  4. the first non-stale mirror.
+ * Mirrors are scanned in index order, starting from a random index when
+ * OBD_FAIL_FLR_RANDOM_PICK_MIRROR is set.
+ *
+ * \param[in] env	execution environment
+ * \param[in] lo	object
+ * \param[in] extent	write range (not used by the current algorithm)
+ *
+ * \retval >= 0		index of the picked mirror in lo->ldo_mirrors
+ * \retval -ENODATA	if there is no non-stale mirror
+ * \retval negative	other error returned by lod_check_ost_avail()
+ */
+static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
+			    struct lu_extent *extent)
+{
+	struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+	unsigned int seq = 0;
+	int i, rc;
+	int picked = -1, second_pick = -1, third_pick = -1;
+	ENTRY;
+
+	if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
+		get_random_bytes(&seq, sizeof(seq));
+		seq %= lo->ldo_mirror_count;
+	}
+
+	/**
+	 * Pick a mirror as the primary, and check the availability of OSTs.
+	 *
+	 * This algo can be revised later after knowing the topology of
+	 * cluster.
+	 */
+	lod_qos_statfs_update(env, lod);
+	for (i = 0; i < lo->ldo_mirror_count; i++) {
+		int index = (i + seq) % lo->ldo_mirror_count;
+
+		if (lo->ldo_mirrors[index].lme_stale) {
+			CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
+			       PFID(lod_object_fid(lo)), index);
+			continue;
+		}
+
+		/* 2nd pick is for the primary mirror containing unavail OST */
+		if (lo->ldo_mirrors[index].lme_primary && second_pick < 0)
+			second_pick = index;
+
+		/* 3rd pick is for non-primary mirror containing unavail OST */
+		if (second_pick < 0 && third_pick < 0)
+			third_pick = index;
+
+		/**
+		 * we found a non-primary 1st pick, we'd like to find a
+		 * potential primary mirror.
+		 */
+		if (picked >= 0 && !lo->ldo_mirrors[index].lme_primary)
+			continue;
+
+		/* check the availability of OSTs */
+		rc = lod_mirror_osts_avail(env, lo, index);
+		if (rc < 0)
+			RETURN(rc);
+
+		/* this mirror has an unavailable OST, try the next one */
+		if (rc == 0)
+			continue;
+
+		/* this mirror has all OSTs available */
+		picked = index;
+
+		/**
+		 * primary with all OSTs are available, this is the perfect
+		 * 1st pick.
+		 */
+		if (lo->ldo_mirrors[index].lme_primary)
+			break;
+	} /* for all mirrors */
+
+	/* failed to pick a sound mirror, lower our expectation */
+	if (picked < 0)
+		picked = second_pick;
+	if (picked < 0)
+		picked = third_pick;
+	if (picked < 0)
+		RETURN(-ENODATA);
+
+	RETURN(picked);
+}
+
static int lod_declare_update_rdonly(const struct lu_env *env,
struct lod_object *lo, struct md_layout_change *mlc,
struct thandle *th)
struct lod_layout_component *lod_comp;
struct layout_intent *layout = mlc->mlc_intent;
struct lu_extent extent = layout->li_extent;
- unsigned int seq = 0;
int picked;
- int i;
int rc;
ENTRY;
CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
PFID(lod_object_fid(lo)), PEXT(&extent));
- if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
- get_random_bytes(&seq, sizeof(seq));
- seq %= lo->ldo_mirror_count;
- }
-
- /**
- * Pick a mirror as the primary.
- * Now it only picks the first mirror that has primary flag set and
- * doesn't have any stale components. This algo should be revised
- * later after knowing the topology of cluster or the availability of
- * OSTs.
- */
- for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) {
- int index = (i + seq) % lo->ldo_mirror_count;
-
- if (!lo->ldo_mirrors[index].lme_stale) {
- if (lo->ldo_mirrors[index].lme_primary) {
- picked = index;
- break;
- }
-
- if (picked < 0)
- picked = index;
- }
- }
- if (picked < 0) /* failed to pick a primary */
- RETURN(-ENODATA);
+ picked = lod_primary_pick(env, lo, &extent);
+ if (picked < 0)
+ RETURN(picked);
CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n",
PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id);
if (lod_comp_inited(lod_comp))
continue;
- CDEBUG(D_LAYOUT, "instantiate: %u / %u\n",
- i, lod_comp_index(lo, lod_comp));
-
info->lti_comp_idx[info->lti_count++] =
lod_comp_index(lo, lod_comp);
}
for ops in "conv=notrunc" ""; do
rm -f $tf
- $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E4m -E-1 \
- $tf || error "create PFLR file $tf failed"
+ $LFS mirror create -N -E2m -E4m -E-1 --flags=prefer \
+ -N -E1m -E2m -E4m -E-1 $tf ||
+ error "create PFLR file $tf failed"
dd if=/dev/zero of=$tf $ops bs=1M seek=2 count=1 ||
error "write PFLR file $tf failed"
# the 2nd component (in mirror 1) should be inited
verify_comp_attr lcme_flags $tf 0x10002 init
# the 3rd component (in mirror 1) should be uninited
- verify_comp_attr lcme_flags $tf 0x10003 0
+ verify_comp_attr lcme_flags $tf 0x10003 prefer
# the 4th component (in mirror 2) should be inited
verify_comp_attr lcme_flags $tf 0x20004 init
# the 5th component (in mirror 2) should be uninited
}
run_test 42 "lfs mirror verify"
+# inactivate one OST && write && restore the OST
+#
+# The OST is always reactivated, even when the write fails, and the file is
+# then expected to report lcm_flags "wp".
+#
+# $1 - file to write
+# $2 - index of the OST to deactivate on $SINGLEMDS
+write_file_43() {
+	local file=$1
+	local ost=$2
+	# OST names carry the index as 4 hex digits
+	local PARAM="osc.${FSNAME}-OST$(printf '%04x' $ost)-osc-M*.active"
+	local wait
+	local rc
+	local flags
+
+	wait=$(do_facet $SINGLEMDS \
+		"$LCTL get_param -n lod.*MDT0000-*.qos_maxage")
+	wait=${wait%%[^0-9]*}
+
+	echo "deactivate OST$ost, waiting for $((wait*2)) seconds"
+	do_facet $SINGLEMDS "$LCTL set_param -n $PARAM 0"
+	# lod_qos_statfs_update needs 2*$wait seconds to refresh targets statfs
+	sleep $((wait * 2))
+	echo "write $file"
+	dd if=/dev/zero of=$file bs=1M count=1
+	rc=$?
+	# reactivate before reporting a write failure, so that the OST is not
+	# left deactivated for whatever runs next
+	echo "restore activating OST$ost, waiting for $((wait*2)) seconds"
+	do_facet $SINGLEMDS "$LCTL set_param -n $PARAM 1"
+	sleep $((wait * 2))
+	[ $rc -eq 0 ] || error "write $file failed"
+
+	flags=$($LFS getstripe -v $file | awk '/lcm_flags:/ { print $2 }')
+	[ "$flags" = wp ] || error "file mirror state $flags != wp"
+}
+
+# Create a file with three 2-stripe mirrors laid out so that every OST is
+# shared by exactly two mirrors, then write to it with each OST deactivated
+# in turn. After each write only the mirror that does not use the deactivated
+# OST is expected to stay non-stale.
+test_43() {
+	[ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
+
+	local tf=$DIR/$tfile
+	local flags
+
+	rm -f $tf
+	## mirror 0 ost (0, 1)
+	## mirror 1 ost (1, 2)
+	## mirror 2 ost (2, 0)
+	$LFS mirror create -N -Eeof -c2 -o0,1 -N -Eeof -c2 -o1,2 \
+		-N -Eeof -c2 -o2,0 $tf ||
+		error "create 3 mirrors file $tf failed"
+
+	################## OST0 ###########################################
+	write_file_43 $tf 0
+	verify_comp_attr lcme_flags $tf 0x10001 init,stale
+	verify_comp_attr lcme_flags $tf 0x20002 init
+	verify_comp_attr lcme_flags $tf 0x30003 init,stale
+
+	# resync
+	echo "resync $tf"
+	$LFS mirror resync $tf || error "resync $tf failed"
+	flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }')
+	[ "$flags" = ro ] || error "file mirror state $flags != ro"
+
+	################## OST1 ###########################################
+	write_file_43 $tf 1
+	verify_comp_attr lcme_flags $tf 0x10001 init,stale
+	verify_comp_attr lcme_flags $tf 0x20002 init,stale
+	verify_comp_attr lcme_flags $tf 0x30003 init
+
+	# resync
+	echo "resync $tf"
+	$LFS mirror resync $tf || error "resync $tf failed"
+	flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }')
+	[ "$flags" = ro ] || error "file mirror state $flags != ro"
+
+	################## OST2 ###########################################
+	write_file_43 $tf 2
+	verify_comp_attr lcme_flags $tf 0x10001 init
+	verify_comp_attr lcme_flags $tf 0x20002 init,stale
+	verify_comp_attr lcme_flags $tf 0x30003 init,stale
+}
+run_test 43 "mirror pick on write"
+
test_44() {
[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
rm -rf $DIR/$tdir