Whamcloud - gitweb
LU-10629 lod: Clear OST pool with setstripe
[fs/lustre-release.git] / lustre / lod / lod_object.c
index acfdd01..1fac134 100644 (file)
@@ -2339,7 +2339,7 @@ static int lod_declare_layout_add(const struct lu_env *env,
 
        LASSERT(lo->ldo_is_composite);
 
-       if (lo->ldo_flr_state != LCM_FL_NOT_FLR)
+       if (lo->ldo_flr_state != LCM_FL_NONE)
                RETURN(-EBUSY);
 
        rc = lod_verify_striping(d, lo, buf, false);
@@ -2549,7 +2549,7 @@ static int lod_declare_layout_del(const struct lu_env *env,
 
        LASSERT(lo->ldo_is_composite);
 
-       if (lo->ldo_flr_state != LCM_FL_NOT_FLR)
+       if (lo->ldo_flr_state != LCM_FL_NONE)
                RETURN(-EBUSY);
 
        magic = comp_v1->lcm_magic;
@@ -2764,7 +2764,7 @@ static int lod_layout_convert(struct lod_thread_info *info)
        lcm->lcm_size = cpu_to_le32(size);
        lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
                                                lmm_save->lmm_layout_gen));
-       lcm->lcm_flags = cpu_to_le16(LCM_FL_NOT_FLR);
+       lcm->lcm_flags = cpu_to_le16(LCM_FL_NONE);
        lcm->lcm_entry_count = cpu_to_le16(1);
        lcm->lcm_mirror_count = 0;
 
@@ -2913,7 +2913,7 @@ static int lod_declare_layout_merge(const struct lu_env *env,
        lcm->lcm_size = cpu_to_le32(size);
        lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
        lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
-       if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NOT_FLR)
+       if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NONE)
                lcm->lcm_flags = cpu_to_le32(LCM_FL_RDONLY);
 
        LASSERT(dt_write_locked(env, dt_object_child(dt)));
@@ -3778,6 +3778,7 @@ static int lod_xattr_set(const struct lu_env *env,
                struct lod_default_striping *lds = &info->lti_def_striping;
                struct lov_user_md_v1 *v1 = buf->lb_buf;
                char pool[LOV_MAXPOOLNAME + 1];
+               bool is_del;
 
                /* get existing striping config */
                rc = lod_get_default_lov_striping(env, lod_dt_obj(dt), lds);
@@ -3790,8 +3791,14 @@ static int lod_xattr_set(const struct lu_env *env,
                                            lds->lds_def_comp_cnt, pool,
                                            sizeof(pool));
 
+               is_del = LOVEA_DELETE_VALUES(v1->lmm_stripe_size,
+                                            v1->lmm_stripe_count,
+                                            v1->lmm_stripe_offset,
+                                            NULL);
+
                /* Retain the pool name if it is not given */
-               if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0') {
+               if (v1->lmm_magic == LOV_USER_MAGIC_V1 && pool[0] != '\0' &&
+                       !is_del) {
                        struct lod_thread_info *info = lod_env_info(env);
                        struct lov_user_md_v3 *v3  = info->lti_ea_store;
 
@@ -5415,7 +5422,7 @@ static int lod_declare_update_plain(const struct lu_env *env,
        int i, rc;
        ENTRY;
 
-       LASSERT(lo->ldo_flr_state == LCM_FL_NOT_FLR);
+       LASSERT(lo->ldo_flr_state == LCM_FL_NONE);
 
        /*
         * In case the client is passing lovea, which only happens during
@@ -5453,15 +5460,6 @@ static int lod_declare_update_plain(const struct lu_env *env,
                        GOTO(out, rc);
        }
 
-       if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
-               /**
-                * trunc transfers [size, eof) in the intent extent, while
-                * we'd instantiated components covers [0, size).
-                */
-               layout->li_extent.e_end = layout->li_extent.e_start;
-               layout->li_extent.e_start = 0;
-       }
-
        /* Make sure defined layout covers the requested write range. */
        lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
        if (lo->ldo_comp_cnt > 1 &&
@@ -5583,27 +5581,62 @@ static void lod_stale_components(struct lod_object *lo, int primary,
        }
 }
 
-static int lod_declare_update_rdonly(const struct lu_env *env,
-               struct lod_object *lo, struct md_layout_change *mlc,
-               struct thandle *th)
+/**
+ * check an OST's availability
+ * \param[in] env      execution environment
+ * \param[in] lo       lod object
+ * \param[in] dt       dt object
+ * \param[in] index    mirror index
+ *
+ * \retval     negative if failed
+ * \retval     1 if \a dt is available
+ * \retval     0 if \a dt is not available
+ */
+static inline int lod_check_ost_avail(const struct lu_env *env,
+                                     struct lod_object *lo,
+                                     struct dt_object *dt, int index)
 {
-       struct lod_thread_info *info = lod_env_info(env);
-       struct lu_attr *layout_attr = &info->lti_layout_attr;
-       struct lod_layout_component *lod_comp;
-       struct layout_intent *layout = mlc->mlc_intent;
-       struct lu_extent extent = layout->li_extent;
-       unsigned int seq = 0;
-       int picked;
-       int i;
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_tgt_desc *ost;
+       __u32 idx;
+       int type = LU_SEQ_RANGE_OST;
        int rc;
-       ENTRY;
 
-       LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE);
-       LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
-       LASSERT(lo->ldo_mirror_count > 0);
+       rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
+       if (rc < 0) {
+               CERROR("%s: can't locate "DFID":rc = %d\n",
+                      lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
+                      rc);
+               return rc;
+       }
+
+       ost = OST_TGT(lod, idx);
+       if (ost->ltd_statfs.os_state &
+               (OS_STATE_READONLY | OS_STATE_ENOSPC | OS_STATE_ENOINO) ||
+           ost->ltd_active == 0) {
+               CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n",
+                      PFID(lod_object_fid(lo)), index, idx, rc);
+               return 0;
+       }
 
-       CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
-              PFID(lod_object_fid(lo)), PEXT(&extent));
+       return 1;
+}
+
+/**
+ * Pick primary mirror for write
+ * \param[in] env      execution environment
+ * \param[in] lo       object
+ * \param[in] extent   write range
+ */
+static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
+                           struct lu_extent *extent)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       unsigned int seq = 0;
+       struct lod_layout_component *lod_comp;
+       int i, j, rc;
+       int picked = -1, second_pick = -1, third_pick = -1;
+       ENTRY;
 
        if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
                get_random_bytes(&seq, sizeof(seq));
@@ -5611,61 +5644,221 @@ static int lod_declare_update_rdonly(const struct lu_env *env,
        }
 
        /**
-        * Pick a mirror as the primary.
-        * Now it only picks the first mirror that has primary flag set and
-        * doesn't have any stale components. This algo should be revised
-        * later after knowing the topology of cluster or the availability of
-        * OSTs.
+        * Pick a mirror as the primary, and check the availability of OSTs.
+        *
+        * This algo can be revised later after knowing the topology of
+        * cluster.
         */
-       for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) {
+       lod_qos_statfs_update(env, lod);
+       for (i = 0; i < lo->ldo_mirror_count; i++) {
+               bool ost_avail = true;
                int index = (i + seq) % lo->ldo_mirror_count;
 
-               if (!lo->ldo_mirrors[index].lme_stale) {
-                       if (lo->ldo_mirrors[index].lme_primary) {
-                               picked = index;
-                               break;
-                       }
-
-                       if (picked < 0)
-                               picked = index;
+               if (lo->ldo_mirrors[index].lme_stale) {
+                       CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
+                              PFID(lod_object_fid(lo)), index);
+                       continue;
                }
-       }
-       if (picked < 0) /* failed to pick a primary */
-               RETURN(-ENODATA);
 
-       CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n",
-              PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id);
+               /* 2nd pick is for the primary mirror containing unavail OST */
+               if (lo->ldo_mirrors[index].lme_primary && second_pick < 0)
+                       second_pick = index;
 
-       /* stale overlapping components from other mirrors */
-       lod_stale_components(lo, picked, &extent);
+               /* 3rd pick is for non-primary mirror containing unavail OST */
+               if (second_pick < 0 && third_pick < 0)
+                       third_pick = index;
 
-       /* instantiate components for the picked mirror, start from 0 */
-       if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
                /**
-                * trunc transfers [size, eof) in the intent extent, we'd
-                * stale components overlapping [size, eof), while we'd
-                * instantiated components covers [0, size).
+                * we found a non-primary 1st pick, we'd like to find a
+                * potential primary mirror.
                 */
-               extent.e_end = extent.e_start;
-       }
-       extent.e_start = 0;
+               if (picked >= 0 && !lo->ldo_mirrors[index].lme_primary)
+                       continue;
+
+               /* check the availability of OSTs */
+               lod_foreach_mirror_comp(lod_comp, lo, index) {
+                       if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
+                               continue;
+
+                       for (j = 0; j < lod_comp->llc_stripe_count; j++) {
+                               struct dt_object *dt = lod_comp->llc_stripe[j];
+
+                               rc = lod_check_ost_avail(env, lo, dt, index);
+                               if (rc < 0)
+                                       RETURN(rc);
+
+                               ost_avail = !!rc;
+                               if (!ost_avail)
+                                       break;
+                       } /* for all dt object in one component */
+                       if (!ost_avail)
+                               break;
+               } /* for all components in a mirror */
 
-       lod_foreach_mirror_comp(lod_comp, lo, picked) {
-               if (!lu_extent_is_overlapped(&extent,
-                                            &lod_comp->llc_extent))
+               /**
+                * skip this mirror unless all the OSTs holding allocated
+                * objects of its components are available.
+                */
+               if (!ost_avail)
+                       continue;
+
+               /* this mirror has all OSTs available */
+               picked = index;
+
+               /**
+                * a primary mirror with all OSTs available is the perfect
+                * 1st pick.
+                */
+               if (lo->ldo_mirrors[index].lme_primary)
                        break;
+       } /* for all mirrors */
+
+       /* failed to pick a sound mirror, lower our expectation */
+       if (picked < 0)
+               picked = second_pick;
+       if (picked < 0)
+               picked = third_pick;
+       if (picked < 0)
+               RETURN(-ENODATA);
 
-               if (lod_comp_inited(lod_comp))
+       RETURN(picked);
+}
+
+/**
+ * figure out the components that should be instantiated for resync.
+ */
+static int lod_prepare_resync(const struct lu_env *env, struct lod_object *lo,
+                             struct lu_extent *extent)
+{
+       struct lod_thread_info *info = lod_env_info(env);
+       struct lod_layout_component *lod_comp;
+       unsigned int need_sync = 0;
+       int i;
+
+       CDEBUG(D_LAYOUT,
+              DFID": instantiate all stale components in "DEXT"\n",
+              PFID(lod_object_fid(lo)), PEXT(extent));
+
+       /**
+        * instantiate all components within this extent, even non-stale
+        * components.
+        */
+       for (i = 0; i < lo->ldo_mirror_count; i++) {
+               if (!lo->ldo_mirrors[i].lme_stale)
                        continue;
 
-               CDEBUG(D_LAYOUT, "instantiate: %u / %u\n",
-                      i, lod_comp_index(lo, lod_comp));
+               lod_foreach_mirror_comp(lod_comp, lo, i) {
+                       if (!lu_extent_is_overlapped(extent,
+                                               &lod_comp->llc_extent))
+                               break;
 
-               info->lti_comp_idx[info->lti_count++] =
-                                               lod_comp_index(lo, lod_comp);
+                       need_sync++;
+
+                       if (lod_comp_inited(lod_comp))
+                               continue;
+
+                       CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
+                              i, lod_comp_index(lo, lod_comp));
+                       info->lti_comp_idx[info->lti_count++] =
+                                       lod_comp_index(lo, lod_comp);
+               }
        }
 
-       lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+       return need_sync ? 0 : -EALREADY;
+}
+
+static int lod_declare_update_rdonly(const struct lu_env *env,
+               struct lod_object *lo, struct md_layout_change *mlc,
+               struct thandle *th)
+{
+       struct lod_thread_info *info = lod_env_info(env);
+       struct lu_attr *layout_attr = &info->lti_layout_attr;
+       struct lod_layout_component *lod_comp;
+       struct lu_extent extent = { 0 };
+       int rc;
+       ENTRY;
+
+       LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
+       LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
+               mlc->mlc_opc == MD_LAYOUT_RESYNC);
+       LASSERT(lo->ldo_mirror_count > 0);
+
+       if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+               struct layout_intent *layout = mlc->mlc_intent;
+               int picked;
+
+               extent = layout->li_extent;
+               CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
+                      PFID(lod_object_fid(lo)), PEXT(&extent));
+
+               picked = lod_primary_pick(env, lo, &extent);
+               if (picked < 0)
+                       RETURN(picked);
+
+               CDEBUG(D_LAYOUT, DFID": picked mirror id %u as primary\n",
+                      PFID(lod_object_fid(lo)),
+                      lo->ldo_mirrors[picked].lme_id);
+
+               if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
+                       /**
+                        * trunc transfers [0, size) in the intent extent, we'd
+                        * stale components overlapping [size, eof).
+                        */
+                       extent.e_start = extent.e_end;
+                       extent.e_end = OBD_OBJECT_EOF;
+               }
+
+               /* stale overlapping components from other mirrors */
+               lod_stale_components(lo, picked, &extent);
+
+               /* restore truncate intent extent */
+               if (layout->li_opc == LAYOUT_INTENT_TRUNC)
+                       extent.e_end = extent.e_start;
+
+               /* instantiate components for the picked mirror, start from 0 */
+               extent.e_start = 0;
+
+               lod_foreach_mirror_comp(lod_comp, lo, picked) {
+                       if (!lu_extent_is_overlapped(&extent,
+                                                    &lod_comp->llc_extent))
+                               break;
+
+                       if (lod_comp_inited(lod_comp))
+                               continue;
+
+                       info->lti_comp_idx[info->lti_count++] =
+                                               lod_comp_index(lo, lod_comp);
+               }
+
+               lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+       } else { /* MD_LAYOUT_RESYNC */
+               int i;
+
+               /**
+                * there could be multiple non-stale mirrors, so we need to
+                * prepare all uninited components, assuming any non-stale
+                * mirror could be picked as the primary mirror.
+                */
+               for (i = 0; i < lo->ldo_mirror_count; i++) {
+                       if (lo->ldo_mirrors[i].lme_stale)
+                               continue;
+
+                       lod_foreach_mirror_comp(lod_comp, lo, i) {
+                               if (!lod_comp_inited(lod_comp))
+                                       break;
+
+                               if (extent.e_end < lod_comp->llc_extent.e_end)
+                                       extent.e_end =
+                                               lod_comp->llc_extent.e_end;
+                       }
+               }
+
+               rc = lod_prepare_resync(env, lo, &extent);
+               if (rc)
+                       GOTO(out, rc);
+               /* change the file state to SYNC_PENDING */
+               lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
+       }
 
        /* Reset the layout version once it's becoming too large.
         * This way it can make sure that the layout version is
@@ -5684,6 +5877,8 @@ static int lod_declare_update_rdonly(const struct lu_env *env,
 
        layout_attr->la_valid = LA_LAYOUT_VERSION;
        layout_attr->la_layout_version = 0; /* set current version */
+       if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
+               layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
        rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
        if (rc)
                GOTO(out, rc);
@@ -5751,19 +5946,23 @@ static int lod_declare_update_write_pending(const struct lu_env *env,
                CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
                       PFID(lod_object_fid(lo)), PEXT(&extent));
 
+               if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
+                       /**
+                        * trunc transfers [0, size) in the intent extent, we'd
+                        * stale components overlapping [size, eof).
+                        */
+                       extent.e_start = extent.e_end;
+                       extent.e_end = OBD_OBJECT_EOF;
+               }
                /* 1. stale overlapping components */
                lod_stale_components(lo, primary, &extent);
 
                /* 2. find out the components need instantiating.
                 * instantiate [0, mlc->mlc_intent->e_end) */
-               if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
-                       /**
-                        * trunc transfers [size, eof) in the intent extent,
-                        * we'd stale components overlapping [size, eof),
-                        * while we'd instantiated components covers [0, size).
-                        */
+
+               /* restore truncate intent extent */
+               if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC)
                        extent.e_end = extent.e_start;
-               }
                extent.e_start = 0;
 
                lod_foreach_mirror_comp(lod_comp, lo, primary) {
@@ -5780,9 +5979,6 @@ static int lod_declare_update_write_pending(const struct lu_env *env,
                                                lod_comp_index(lo, lod_comp);
                }
        } else { /* MD_LAYOUT_RESYNC */
-               /* figure out the components that have been instantiated in
-                * in primary to decide what components should be instantiated
-                * in stale mirrors */
                lod_foreach_mirror_comp(lod_comp, lo, primary) {
                        if (!lod_comp_inited(lod_comp))
                                break;
@@ -5790,36 +5986,9 @@ static int lod_declare_update_write_pending(const struct lu_env *env,
                        extent.e_end = lod_comp->llc_extent.e_end;
                }
 
-               CDEBUG(D_LAYOUT,
-                      DFID": instantiate all stale components in "DEXT"\n",
-                      PFID(lod_object_fid(lo)), PEXT(&extent));
-
-               /* 1. instantiate all components within this extent, even
-                * non-stale components so that it won't need to instantiate
-                * those components for mirror truncate later. */
-               for (i = 0; i < lo->ldo_mirror_count; i++) {
-                       if (primary == i)
-                               continue;
-
-                       LASSERTF(lo->ldo_mirrors[i].lme_stale,
-                                "both %d and %d are primary\n", i, primary);
-
-                       lod_foreach_mirror_comp(lod_comp, lo, i) {
-                               if (!lu_extent_is_overlapped(&extent,
-                                                       &lod_comp->llc_extent))
-                                       break;
-
-                               if (lod_comp_inited(lod_comp))
-                                       continue;
-
-                               CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
-                                      i, lod_comp_index(lo, lod_comp));
-
-                               info->lti_comp_idx[info->lti_count++] =
-                                               lod_comp_index(lo, lod_comp);
-                       }
-               }
-
+               rc = lod_prepare_resync(env, lo, &extent);
+               if (rc)
+                       GOTO(out, rc);
                /* change the file state to SYNC_PENDING */
                lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
        }
@@ -5909,8 +6078,8 @@ static int lod_declare_update_sync_pending(const struct lu_env *env,
                GOTO(out, rc = -EINVAL);
        }
 
-       if (!sync_components || !resync_components) {
-               CDEBUG(D_LAYOUT, DFID": no mirror in sync or resync\n",
+       if (!sync_components || (mlc->mlc_resync_count && !resync_components)) {
+               CDEBUG(D_LAYOUT, DFID": no mirror in sync\n",
                       PFID(lod_object_fid(lo)));
 
                /* tend to return an error code here to prevent
@@ -5961,7 +6130,7 @@ static int lod_declare_layout_change(const struct lu_env *env,
                GOTO(out, rc);
 
        switch (lo->ldo_flr_state) {
-       case LCM_FL_NOT_FLR:
+       case LCM_FL_NONE:
                rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
                                              &mlc->mlc_buf, th);
                break;