struct lod_default_striping *lds = &info->lti_def_striping;
struct lov_user_md_v1 *v1 = buf->lb_buf;
char pool[LOV_MAXPOOLNAME + 1];
+ bool is_del;
/* get existing striping config */
rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds);
lds->lds_def_comp_cnt, pool,
sizeof(pool));
+ is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
+ v1->lmm_stripe_count,
+ v1->lmm_stripe_offset,
+ NULL);
+
/* Retain the pool name if it is not given */
- if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0') {
+ if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
+ !is_del) {
struct lod_thread_info *info = lod_env_info(env);
struct lov_user_md_v3 *v3 = info->lti_ea_store;
}
}
-static int lod_declare_update_rdonly(const struct lu_env *env,
- struct lod_object *lo, struct md_layout_change *mlc,
- struct thandle *th)
+/**
+ * Check the availability of the OST that holds a stripe object.
+ *
+ * The OST index is resolved from the FID of \a dt via lod_fld_lookup().
+ * The OST is reported as unavailable when its ltd_statfs.os_state has any
+ * of OS_STATE_READONLY, OS_STATE_ENOSPC or OS_STATE_ENOINO set, or when
+ * its ltd_active field is zero.
+ *
+ * NOTE(review): the result of OST_TGT() is dereferenced without a NULL
+ * check; confirm that a successful lookup always maps to a valid target.
+ * NOTE(review): rc printed in the "unavail" debug message is the lookup
+ * result, which is never negative at that point.
+ *
+ * \param[in] env execution environment
+ * \param[in] lo lod object, used to find the lod device and for logging
+ * \param[in] dt stripe object whose OST is checked
+ * \param[in] index mirror index, only used in the debug message
+ *
+ * \retval negative error returned by lod_fld_lookup()
+ * \retval 1 if the OST of \a dt is available
+ * \retval 0 if the OST of \a dt is not available
+ */
+static inline int lod_check_ost_avail(const struct lu_env *env,
+ struct lod_object *lo,
+ struct dt_object *dt, int index)
{
- struct lod_thread_info *info = lod_env_info(env);
- struct lu_attr *layout_attr = &info->lti_layout_attr;
- struct lod_layout_component *lod_comp;
- struct layout_intent *layout = mlc->mlc_intent;
- struct lu_extent extent = layout->li_extent;
- unsigned int seq = 0;
- int picked;
- int i;
+ struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+ struct lod_tgt_desc *ost;
+ __u32 idx;
+ int type = LU_SEQ_RANGE_OST;
int rc;
- ENTRY;
- LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE);
- LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
- LASSERT(lo->ldo_mirror_count > 0);
+ rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
+ if (rc < 0) {
+ CERROR("%s: can't locate "DFID":rc = %d\n",
+ lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
+ rc);
+ return rc;
+ }
+
+ ost = OST_TGT(lod, idx);
+ if (ost->ltd_statfs.os_state &
+ (OS_STATE_READONLY | OS_STATE_ENOSPC | OS_STATE_ENOINO) ||
+ ost->ltd_active == 0) {
+ CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n",
+ PFID(lod_object_fid(lo)), index, idx, rc);
+ return 0;
+ }
+
+ return 1;
+}
- CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
- PFID(lod_object_fid(lo)), PEXT(&extent));
+/**
+ * Pick the primary mirror for write.
+ *
+ * Mirrors are scanned starting at index (seq % mirror count), where seq is
+ * 0 unless the OBD_FAIL_FLR_RANDOM_PICK_MIRROR fail point randomizes it.
+ * Stale mirrors are skipped. The preference order is:
+ *  1. a mirror whose instantiated components only have stripes on
+ *     available OSTs (see lod_check_ost_avail()); the scan stops as soon
+ *     as such a mirror also has lme_primary set;
+ *  2. the first non-stale mirror with lme_primary set;
+ *  3. the first non-stale mirror, recorded only while no 2nd pick exists.
+ *
+ * \param[in] env execution environment
+ * \param[in] lo object
+ * \param[in] extent write range (not referenced by this function's body)
+ *
+ * \retval index (>= 0) into lo->ldo_mirrors of the picked mirror
+ * \retval -ENODATA if every mirror is stale
+ * \retval negative error returned by lod_check_ost_avail()
+ */
+static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
+ struct lu_extent *extent)
+{
+ struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+ unsigned int seq = 0;
+ struct lod_layout_component *lod_comp;
+ int i, j, rc;
+ int picked = -1, second_pick = -1, third_pick = -1;
+ ENTRY;
if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
get_random_bytes(&seq, sizeof(seq));
}
/**
- * Pick a mirror as the primary.
- * Now it only picks the first mirror that has primary flag set and
- * doesn't have any stale components. This algo should be revised
- * later after knowing the topology of cluster or the availability of
- * OSTs.
+ * Pick a mirror as the primary, and check the availability of OSTs.
+ *
+ * This algo can be revised later after knowing the topology of
+ * cluster.
*/
- for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) {
+ lod_qos_statfs_update(env, lod);
+ for (i = 0; i < lo->ldo_mirror_count; i++) {
+ bool ost_avail = true;
int index = (i + seq) % lo->ldo_mirror_count;
- if (!lo->ldo_mirrors[index].lme_stale) {
- if (lo->ldo_mirrors[index].lme_primary) {
- picked = index;
- break;
- }
-
- if (picked < 0)
- picked = index;
+ if (lo->ldo_mirrors[index].lme_stale) {
+ CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
+ PFID(lod_object_fid(lo)), index);
+ continue;
}
- }
- if (picked < 0) /* failed to pick a primary */
- RETURN(-ENODATA);
- CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n",
- PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id);
+ /* 2nd pick is for the primary mirror containing unavail OST */
+ if (lo->ldo_mirrors[index].lme_primary && second_pick < 0)
+ second_pick = index;
+
+ /* 3rd pick is for non-primary mirror containing unavail OST */
+ if (second_pick < 0 && third_pick < 0)
+ third_pick = index;
- if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
/**
- * trunc transfers [0, size) in the intent extent, we'd
- * stale components overlapping [size, eof).
+ * a non-primary 1st pick has already been found; from now on
+ * only a primary mirror is worth checking as a better pick.
*/
- extent.e_start = extent.e_end;
- extent.e_end = OBD_OBJECT_EOF;
- }
+ if (picked >= 0 && !lo->ldo_mirrors[index].lme_primary)
+ continue;
+
+ /* check the availability of OSTs */
+ lod_foreach_mirror_comp(lod_comp, lo, index) {
+ if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
+ continue;
- /* stale overlapping components from other mirrors */
- lod_stale_components(lo, picked, &extent);
+ for (j = 0; j < lod_comp->llc_stripe_count; j++) {
+ struct dt_object *dt = lod_comp->llc_stripe[j];
- /* restore truncate intent extent */
- if (layout->li_opc == LAYOUT_INTENT_TRUNC)
- extent.e_end = extent.e_start;
+ rc = lod_check_ost_avail(env, lo, dt, index);
+ if (rc < 0)
+ RETURN(rc);
+
+ ost_avail = !!rc;
+ if (!ost_avail)
+ break;
+ } /* for all dt object in one component */
+ if (!ost_avail)
+ break;
+ } /* for all components in a mirror */
+
+ /**
+ * skip this mirror unless every OST holding an allocated
+ * object of its components is available.
+ */
+ if (!ost_avail)
+ continue;
- /* instantiate components for the picked mirror, start from 0 */
- extent.e_start = 0;
+ /* this mirror has all OSTs available */
+ picked = index;
- lod_foreach_mirror_comp(lod_comp, lo, picked) {
- if (!lu_extent_is_overlapped(&extent,
- &lod_comp->llc_extent))
+ /**
+ * a primary mirror with all OSTs available is the perfect
+ * 1st pick, stop searching.
+ */
+ if (lo->ldo_mirrors[index].lme_primary)
break;
+ } /* for all mirrors */
+
+ /* failed to pick a sound mirror, lower our expectation */
+ if (picked < 0)
+ picked = second_pick;
+ if (picked < 0)
+ picked = third_pick;
+ if (picked < 0)
+ RETURN(-ENODATA);
- if (lod_comp_inited(lod_comp))
+ RETURN(picked);
+}
+
+/**
+ * Figure out the components that should be instantiated for resync.
+ *
+ * Walks every stale mirror and, for each of its components overlapping
+ * \a extent, counts it and, if it is not yet inited, appends its index to
+ * info->lti_comp_idx[] (advancing info->lti_count). Scanning of a mirror
+ * stops at its first component that does not overlap \a extent.
+ * info->lti_count is not reset here.
+ *
+ * \param[in] env execution environment, used to get the thread info
+ * \param[in] lo object whose mirrors are scanned
+ * \param[in] extent range that the stale components are matched against
+ *
+ * \retval 0 at least one component of a stale mirror overlaps \a extent
+ * \retval -EALREADY no such component was found
+ */
+static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
+ struct lu_extent *extent)
+{
+ struct lod_thread_info *info = lod_env_info(env);
+ struct lod_layout_component *lod_comp;
+ unsigned int need_sync = 0;
+ int i;
+
+ CDEBUG(D_LAYOUT,
+ DFID": instantiate all stale components in "DEXT"\n",
+ PFID(lod_object_fid(lo)), PEXT(extent));
+
+ /**
+ * instantiate all components within this extent, even non-stale
+ * components; only mirrors marked stale are visited.
+ */
+ for (i = 0; i < lo->ldo_mirror_count; i++) {
+ if (!lo->ldo_mirrors[i].lme_stale)
continue;
- CDEBUG(D_LAYOUT, "instantiate: %u / %u\n",
- i, lod_comp_index(lo, lod_comp));
+ lod_foreach_mirror_comp(lod_comp, lo, i) {
+ if (!lu_extent_is_overlapped(extent,
+ &lod_comp->llc_extent))
+ break;
- info->lti_comp_idx[info->lti_count++] =
- lod_comp_index(lo, lod_comp);
+ need_sync++;
+
+ if (lod_comp_inited(lod_comp))
+ continue;
+
+ CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
+ i, lod_comp_index(lo, lod_comp));
+ info->lti_comp_idx[info->lti_count++] =
+ lod_comp_index(lo, lod_comp);
+ }
}
- lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+ return need_sync ? 0 : -EALREADY;
+}
+
+static int lod_declare_update_rdonly(const struct lu_env *env,
+ struct lod_object *lo, struct md_layout_change *mlc,
+ struct thandle *th)
+{
+ struct lod_thread_info *info = lod_env_info(env);
+ struct lu_attr *layout_attr = &info->lti_layout_attr;
+ struct lod_layout_component *lod_comp;
+ struct lu_extent extent = { 0 };
+ int rc;
+ ENTRY;
+
+ LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
+ LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
+ mlc->mlc_opc == MD_LAYOUT_RESYNC);
+ LASSERT(lo->ldo_mirror_count > 0);
+
+ if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+ struct layout_intent *layout = mlc->mlc_intent;
+ int picked;
+
+ extent = layout->li_extent;
+ CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
+ PFID(lod_object_fid(lo)), PEXT(&extent));
+
+ picked = lod_primary_pick(env, lo, &extent);
+ if (picked < 0)
+ RETURN(picked);
+
+ CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
+ PFID(lod_object_fid(lo)),
+ lo->ldo_mirrors[picked].lme_id);
+
+ if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
+ /**
+ * trunc transfers [0, size) in the intent extent, we'd
+ * stale components overlapping [size, eof).
+ */
+ extent.e_start = extent.e_end;
+ extent.e_end = OBD_OBJECT_EOF;
+ }
+
+ /* stale overlapping components from other mirrors */
+ lod_stale_components(lo, picked, &extent);
+
+ /* restore truncate intent extent */
+ if (layout->li_opc == LAYOUT_INTENT_TRUNC)
+ extent.e_end = extent.e_start;
+
+ /* instantiate components for the picked mirror, start from 0 */
+ extent.e_start = 0;
+
+ lod_foreach_mirror_comp(lod_comp, lo, picked) {
+ if (!lu_extent_is_overlapped(&extent,
+ &lod_comp->llc_extent))
+ break;
+
+ if (lod_comp_inited(lod_comp))
+ continue;
+
+ info->lti_comp_idx[info->lti_count++] =
+ lod_comp_index(lo, lod_comp);
+ }
+
+ lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+ } else { /* MD_LAYOUT_RESYNC */
+ int i;
+
+ /**
+ * the file could contain multiple non-stale mirrors, so we need to
+ * prepare all uninited components, assuming any non-stale mirror
+ * could be picked as the primary mirror.
+ */
+ for (i = 0; i < lo->ldo_mirror_count; i++) {
+ if (lo->ldo_mirrors[i].lme_stale)
+ continue;
+
+ lod_foreach_mirror_comp(lod_comp, lo, i) {
+ if (!lod_comp_inited(lod_comp))
+ break;
+
+ if (extent.e_end < lod_comp->llc_extent.e_end)
+ extent.e_end =
+ lod_comp->llc_extent.e_end;
+ }
+ }
+
+ rc = lod_prepare_resync(env, lo, &extent);
+ if (rc)
+ GOTO(out, rc);
+ /* change the file state to SYNC_PENDING */
+ lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
+ }
/* Reset the layout version once it's becoming too large.
* This way it can make sure that the layout version is
layout_attr->la_valid = LA_LAYOUT_VERSION;
layout_attr->la_layout_version = 0; /* set current version */
+ if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
+ layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
if (rc)
GOTO(out, rc);
lod_comp_index(lo, lod_comp);
}
} else { /* MD_LAYOUT_RESYNC */
- /* figure out the components that have been instantiated in
- * in primary to decide what components should be instantiated
- * in stale mirrors */
lod_foreach_mirror_comp(lod_comp, lo, primary) {
if (!lod_comp_inited(lod_comp))
break;
extent.e_end = lod_comp->llc_extent.e_end;
}
- CDEBUG(D_LAYOUT,
- DFID": instantiate all stale components in "DEXT"\n",
- PFID(lod_object_fid(lo)), PEXT(&extent));
-
- /* 1. instantiate all components within this extent, even
- * non-stale components so that it won't need to instantiate
- * those components for mirror truncate later. */
- for (i = 0; i < lo->ldo_mirror_count; i++) {
- if (primary == i)
- continue;
-
- LASSERTF(lo->ldo_mirrors[i].lme_stale,
- "both %d and %d are primary\n", i, primary);
-
- lod_foreach_mirror_comp(lod_comp, lo, i) {
- if (!lu_extent_is_overlapped(&extent,
- &lod_comp->llc_extent))
- break;
-
- if (lod_comp_inited(lod_comp))
- continue;
-
- CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
- i, lod_comp_index(lo, lod_comp));
-
- info->lti_comp_idx[info->lti_count++] =
- lod_comp_index(lo, lod_comp);
- }
- }
-
+ rc = lod_prepare_resync(env, lo, &extent);
+ if (rc)
+ GOTO(out, rc);
/* change the file state to SYNC_PENDING */
lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
}
GOTO(out, rc = -EINVAL);
}
- if (!sync_components || !resync_components) {
- CDEBUG(D_LAYOUT, DFID": no mirror in sync or resync\n",
+ if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
+ CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
PFID(lod_object_fid(lo)));
/* tend to return an error code here to prevent