Whamcloud - gitweb
LU-10448 lod: pick primary mirror for write 11/30711/12
authorBobi Jam <bobijam.xu@intel.com>
Tue, 26 Dec 2017 10:16:40 +0000 (18:16 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Wed, 14 Feb 2018 00:51:18 +0000 (00:51 +0000)
When a mirrored file is written for the first time, the MDS chooses a
mirror to receive the data. This patch defines a primary-picking policy
function, lod_primary_pick(), that avoids mirrors containing
unavailable OSTs.

Signed-off-by: Bobi Jam <bobijam.xu@intel.com>
Change-Id: I5d6d0459e96583294c3040a7994c33114be1e439
Reviewed-on: https://review.whamcloud.com/30711
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Fan Yong <fan.yong@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/lod/lod_internal.h
lustre/lod/lod_object.c
lustre/lod/lod_qos.c
lustre/tests/sanity-flr.sh

index 9cab1e3..022f7a9 100644 (file)
@@ -703,6 +703,7 @@ __u16 lod_comp_entry_stripe_count(struct lod_object *lo,
                                  bool is_dir);
 __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo,
                           __u16 stripe_count);
+void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod);
 
 /* lproc_lod.c */
 int lod_procfs_init(struct lod_device *lod);
index 9aff034..7a5697e 100644 (file)
@@ -5574,6 +5574,149 @@ static void lod_stale_components(struct lod_object *lo, int primary,
        }
 }
 
+/**
+ * Check the availability of the OST that holds stripe object \a dt.
+ * \param[in] env      execution environment
+ * \param[in] lo       lod object; gives the lod device and the FID to log
+ * \param[in] dt       stripe object whose FID is looked up to find its OST
+ * \param[in] index    index of the mirror being checked (only logged)
+ *
+ * \retval     negative if lod_fld_lookup() on the FID of \a dt failed
+ * \retval     1 if the OST is active and not READONLY, ENOSPC or ENOINO
+ * \retval     0 otherwise
+ */
+static inline int lod_check_ost_avail(const struct lu_env *env,
+                                     struct lod_object *lo,
+                                     struct dt_object *dt, int index)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_tgt_desc *ost;
+       __u32 idx;
+       int type = LU_SEQ_RANGE_OST;
+       int rc;
+
+       rc = lod_fld_lookup(env, lod, lu_object_fid(&dt->do_lu), &idx, &type);
+       if (rc < 0) {
+               CERROR("%s: can't locate "DFID":rc = %d\n",
+                      lod2obd(lod)->obd_name, PFID(lu_object_fid(&dt->do_lu)),
+                      rc);
+               return rc;
+       }
+
+       ost = OST_TGT(lod, idx);
+       if (ost->ltd_statfs.os_state &
+               (OS_STATE_READONLY | OS_STATE_ENOSPC | OS_STATE_ENOINO) ||
+           ost->ltd_active == 0) {
+               CDEBUG(D_LAYOUT, DFID ": mirror %d OST%d unavail, rc = %d\n",
+                      PFID(lod_object_fid(lo)), index, idx, rc);
+               return 0;
+       }
+
+       return 1;
+}
+
+/**
+ * Pick the primary mirror for write; returns its index or a negative errno
+ * \param[in] env      execution environment
+ * \param[in] lo       object whose mirrors (lo->ldo_mirrors[]) are examined
+ * \param[in] extent   write range (not referenced in the function body)
+ */
+static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
+                           struct lu_extent *extent)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       unsigned int seq = 0;
+       struct lod_layout_component *lod_comp;
+       int i, j, rc;
+       int picked = -1, second_pick = -1, third_pick = -1;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
+               get_random_bytes(&seq, sizeof(seq));
+               seq %= lo->ldo_mirror_count;
+       }
+
+       /**
+        * Refresh targets' statfs, then walk the mirrors from index seq and
+        * check their OSTs: a primary mirror with all OSTs available wins,
+        * else the first such mirror, else the 2nd/3rd picks below.
+        * This algo can be revised later after knowing the cluster topology.
+        */
+       lod_qos_statfs_update(env, lod);
+       for (i = 0; i < lo->ldo_mirror_count; i++) {
+               bool ost_avail = true;
+               int index = (i + seq) % lo->ldo_mirror_count;
+
+               if (lo->ldo_mirrors[index].lme_stale) {
+                       CDEBUG(D_LAYOUT, DFID": mirror %d stale\n",
+                              PFID(lod_object_fid(lo)), index);
+                       continue;
+               }
+
+               /* 2nd pick: first non-stale primary, even with unavail OSTs */
+               if (lo->ldo_mirrors[index].lme_primary && second_pick < 0)
+                       second_pick = index;
+
+               /* 3rd pick: first non-stale mirror seen before any primary */
+               if (second_pick < 0 && third_pick < 0)
+                       third_pick = index;
+
+               /**
+                * we already have a non-primary 1st pick; only a primary
+                * mirror may still replace it, so skip non-primary ones.
+                */
+               if (picked >= 0 && !lo->ldo_mirrors[index].lme_primary)
+                       continue;
+
+               /* check the OSTs of every instantiated component's stripes */
+               lod_foreach_mirror_comp(lod_comp, lo, index) {
+                       if (!lod_comp_inited(lod_comp) || !lod_comp->llc_stripe)
+                               continue;
+
+                       for (j = 0; j < lod_comp->llc_stripe_count; j++) {
+                               struct dt_object *dt = lod_comp->llc_stripe[j];
+
+                               rc = lod_check_ost_avail(env, lo, dt, index);
+                               if (rc < 0)
+                                       RETURN(rc);
+
+                               ost_avail = !!rc;
+                               if (!ost_avail)
+                                       break;
+                       } /* for all dt object in one component */
+                       if (!ost_avail)
+                               break;
+               } /* for all components in a mirror */
+
+               /**
+                * skip this mirror if any OST holding an allocated object
+                * of its instantiated components is unavailable.
+                */
+               if (!ost_avail)
+                       continue;
+
+               /* this mirror has all OSTs available */
+               picked = index;
+
+               /**
+                * a primary mirror with all OSTs available is the perfect
+                * 1st pick, stop searching.
+                */
+               if (lo->ldo_mirrors[index].lme_primary)
+                       break;
+       } /* for all mirrors */
+
+       /* failed to pick a sound mirror, lower our expectation */
+       if (picked < 0)
+               picked = second_pick;
+       if (picked < 0)
+               picked = third_pick;
+       if (picked < 0)
+               RETURN(-ENODATA);
+
+       RETURN(picked);
+}
+
 static int lod_declare_update_rdonly(const struct lu_env *env,
                struct lod_object *lo, struct md_layout_change *mlc,
                struct thandle *th)
@@ -5583,9 +5726,7 @@ static int lod_declare_update_rdonly(const struct lu_env *env,
        struct lod_layout_component *lod_comp;
        struct layout_intent *layout = mlc->mlc_intent;
        struct lu_extent extent = layout->li_extent;
-       unsigned int seq = 0;
        int picked;
-       int i;
        int rc;
        ENTRY;
 
@@ -5596,33 +5737,9 @@ static int lod_declare_update_rdonly(const struct lu_env *env,
        CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
               PFID(lod_object_fid(lo)), PEXT(&extent));
 
-       if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
-               get_random_bytes(&seq, sizeof(seq));
-               seq %= lo->ldo_mirror_count;
-       }
-
-       /**
-        * Pick a mirror as the primary.
-        * Now it only picks the first mirror that has primary flag set and
-        * doesn't have any stale components. This algo should be revised
-        * later after knowing the topology of cluster or the availability of
-        * OSTs.
-        */
-       for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) {
-               int index = (i + seq) % lo->ldo_mirror_count;
-
-               if (!lo->ldo_mirrors[index].lme_stale) {
-                       if (lo->ldo_mirrors[index].lme_primary) {
-                               picked = index;
-                               break;
-                       }
-
-                       if (picked < 0)
-                               picked = index;
-               }
-       }
-       if (picked < 0) /* failed to pick a primary */
-               RETURN(-ENODATA);
+       picked = lod_primary_pick(env, lo, &extent);
+       if (picked < 0)
+               RETURN(picked);
 
        CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n",
               PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id);
@@ -5654,9 +5771,6 @@ static int lod_declare_update_rdonly(const struct lu_env *env,
                if (lod_comp_inited(lod_comp))
                        continue;
 
-               CDEBUG(D_LAYOUT, "instantiate: %u / %u\n",
-                      i, lod_comp_index(lo, lod_comp));
-
                info->lti_comp_idx[info->lti_count++] =
                                                lod_comp_index(lo, lod_comp);
        }
index 7101f54..a5b4717 100644 (file)
@@ -259,8 +259,7 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
  * \param[in] env      execution environment for this thread
  * \param[in] lod      LOD device
  */
-static void lod_qos_statfs_update(const struct lu_env *env,
-                                 struct lod_device *lod)
+void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod)
 {
        struct obd_device *obd = lod2obd(lod);
        struct ost_pool *osts = &(lod->lod_pool_info);
index 5eaf52c..4201887 100644 (file)
@@ -1340,8 +1340,9 @@ test_40() {
        for ops in "conv=notrunc" ""; do
                rm -f $tf
 
-               $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E4m -E-1 \
-                       $tf || error "create PFLR file $tf failed"
+               $LFS mirror create -N -E2m -E4m -E-1  --flags=prefer \
+                               -N -E1m -E2m -E4m -E-1 $tf ||
+                       error "create PFLR file $tf failed"
                dd if=/dev/zero of=$tf $ops bs=1M seek=2 count=1 ||
                        error "write PFLR file $tf failed"
 
@@ -1358,7 +1359,7 @@ test_40() {
                # the 2nd component (in mirror 1) should be inited
                verify_comp_attr lcme_flags $tf 0x10002 init
                # the 3rd component (in mirror 1) should be uninited
-               verify_comp_attr lcme_flags $tf 0x10003 0
+               verify_comp_attr lcme_flags $tf 0x10003 prefer
                # the 4th component (in mirror 2) should be inited
                verify_comp_attr lcme_flags $tf 0x20004 init
                # the 5th component (in mirror 2) should be uninited
@@ -1517,6 +1518,77 @@ test_42() {
 }
 run_test 42 "lfs mirror verify"
 
+# inactivate one OST && write && restore the OST
+write_file_43() {
+       local file=$1
+       local ost=$2
+       local PARAM="osc.${FSNAME}-OST000${ost}-osc-M*.active"
+       local wait
+
+       wait=$(do_facet $SINGLEMDS \
+               "$LCTL get_param -n lod.*MDT0000-*.qos_maxage")
+       wait=${wait%%[^0-9]*}
+
+       echo "deactivate OST$ost, waiting for $((wait*2)) seconds"
+       $(do_facet $SINGLEMDS "$LCTL set_param -n $PARAM 0")
+       # lod_qos_statfs_update needs 2*$wait seconds to refresh targets statfs
+       sleep $(($wait * 2))
+       echo "write $file"
+       dd if=/dev/zero of=$file bs=1M count=1 || error "write $file failed"
+       echo "restore activating OST$ost, waiting for $((wait*2)) seconds"
+       $(do_facet $SINGLEMDS "$LCTL set_param -n $PARAM 1")
+       sleep $((wait * 2))
+
+       local flags=$($LFS getstripe -v $file | awk '/lcm_flags:/ { print $2 }')
+       [ $flags = wp ] || error "file mirror state $flags != wp"
+}
+
+test_43() {
+       [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
+
+       local tf=$DIR/$tfile
+       local flags
+
+       rm -f $tf
+       ##   1st mirror (comp 0x10001)  ost (0, 1)
+       ##   2nd mirror (comp 0x20002)  ost (1, 2)
+       ##   3rd mirror (comp 0x30003)  ost (2, 0)
+       $LFS mirror create -N -Eeof -c2 -o0,1 -N -Eeof -c2 -o1,2 \
+               -N -Eeof -c2 -o2,0 $tf ||
+               error "create 3 mirrors file $tf failed"
+
+       ############ OST0 down: only the 2nd mirror avoids it #############
+       write_file_43 $tf 0
+       verify_comp_attr lcme_flags $tf 0x10001 init,stale
+       verify_comp_attr lcme_flags $tf 0x20002 init
+       verify_comp_attr lcme_flags $tf 0x30003 init,stale
+
+       # resync the stale mirrors; the file state must return to "ro"
+       echo "resync $tf"
+       $LFS mirror resync $tf
+       flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }')
+       [ $flags = ro ] || error "file mirror state $flags != ro"
+
+       ############ OST1 down: only the 3rd mirror avoids it #############
+       write_file_43 $tf 1
+       verify_comp_attr lcme_flags $tf 0x10001 init,stale
+       verify_comp_attr lcme_flags $tf 0x20002 init,stale
+       verify_comp_attr lcme_flags $tf 0x30003 init
+
+       # resync the stale mirrors; the file state must return to "ro"
+       echo "resync $tf"
+       $LFS mirror resync $tf
+       flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }')
+       [ $flags = ro ] || error "file mirror state $flags != ro"
+
+       ############ OST2 down: only the 1st mirror avoids it #############
+       write_file_43 $tf 2
+       verify_comp_attr lcme_flags $tf 0x10001 init
+       verify_comp_attr lcme_flags $tf 0x20002 init,stale
+       verify_comp_attr lcme_flags $tf 0x30003 init,stale
+}
+run_test 43 "mirror pick on write"
+
 test_44() {
        [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return
        rm -rf $DIR/$tdir